libjmmcg release_579_6_g8cffd
A C++ library containing an eclectic mix of useful, advanced components.

parallel_algorithms.hpp
#ifndef LIBJMMCG_CORE_PRIVATE_PARALLEL_ALGORITHMS_HPP
#define LIBJMMCG_CORE_PRIVATE_PARALLEL_ALGORITHMS_HPP

/******************************************************************************
** Copyright © 2004 by J.M.McGuiness, coder@hussar.me.uk
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public
** License as published by the Free Software Foundation; either
** version 2.1 of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/

#include "../../core/config.h"
#include "../../core/shared_ptr.hpp"

#include <boost/function.hpp>
namespace jmmcg { namespace LIBJMMCG_VER_NAMESPACE { namespace ppd { namespace private_ {

    const char shuffle_str[]="shuffle";
    const char lhs_merge_str[]="lhs_merge";
    const char rhs_merge_str[]="rhs_merge";
    const char combine1_str[]="combine1";
    const char combine2_str[]="combine2";
    const char ascending_lhs_str[]="ascending_lhs";
    const char descending_rhs_str[]="descending_rhs";
    const char merge_str[]="merge";
    const char arg_str[]="arg";
    const char lhs_str[]="lhs";
    const char rhs_str[]="rhs";
    const char unary_fun_str[]="unary_fun";
    const char binary_fun_str[]="binary_fun";

    namespace alg_wk_wrap {

        template<class V>
        struct pass_value {
            typedef void result_type;
            typedef V element_type;

            element_type &value;

            explicit constexpr pass_value(element_type &r) FORCE_INLINE
            : value(r) {
            }

            constexpr void __fastcall process() noexcept(true) FORCE_INLINE {
            }

            constexpr bool __fastcall operator<(pass_value const &) const noexcept(true) FORCE_INLINE {
                return true;
            }

            template<class CoreWk>
            static constexpr void FORCE_INLINE resize_output(CoreWk &) noexcept(true) {}
        };

        /// Assist with implementing the parallel versions of the standard algorithms.
        /**
            \see for_each_reduce
        */
        template<class Op>
        struct for_each_work_type {
            typedef void result_type;
            typedef Op operation_type;

            /**
                To assist in allowing compile-time computation of the algorithmic order of the threading model.
            */
            // (elided in the original listing: the memory_access_mode definition)

            /**
                Need this to be non-const, in case pointer-types get stored in here; otherwise the compiler will complain (not unreasonably) about the const-ness.
            */
            operation_type op;

            constexpr for_each_work_type() noexcept(true) FORCE_INLINE {
            }
            explicit constexpr for_each_work_type(operation_type const &o) noexcept(true) FORCE_INLINE
            : op(o) {
            }

            constexpr void __fastcall process() noexcept(true) FORCE_INLINE {
            }

            constexpr bool __fastcall operator<(for_each_work_type const &) const noexcept(true) FORCE_INLINE {
                return true;
            }

            template<class CoreWk>
            static constexpr void FORCE_INLINE resize_output(CoreWk &) noexcept(true) {}
        };

        /// Assist with implementing the parallel versions of the standard algorithms.
        /**
            \see for_each_reduce
        */
        template<class Op>
        struct transform_work_type {
            typedef void result_type;
            typedef Op operation_type;

            /**
                To assist in allowing compile-time computation of the algorithmic order of the threading model.
            */
            // (elided in the original listing: the memory_access_mode definition)

            /**
                Need this to be non-const, in case pointer-types get stored in here; otherwise the compiler will complain (not unreasonably) about the const-ness.
            */
            operation_type op;

            constexpr transform_work_type() noexcept(true) FORCE_INLINE {
            }
            explicit constexpr transform_work_type(operation_type const &o) noexcept(true) FORCE_INLINE
            : op(o) {
            }

            constexpr void __fastcall process() noexcept(true) FORCE_INLINE {
            }

            constexpr bool __fastcall operator<(transform_work_type const &) const noexcept(true) FORCE_INLINE {
                return true;
            }

            template<class CoreWk>
            static void FORCE_INLINE resize_output(CoreWk &wk) noexcept(false) {
                wk.resize_output(wk.work_complete()->containers().input1.size());
            }
        };

        /// Assist with implementing the parallel versions of the standard algorithms.
        /**
            Note that this operation should operate on an output range that no other thread should modify, i.e. that range should have at least a read-lock taken on it.

            \see for_each_work_type
            \see thread_base::for_each
            \see thread_base::alg_wrapper1
        */
        template<class Conts, typename Fn>
        struct for_each_reduce {
            typedef Fn operation_type;
            typedef Conts containers_type;
            // (elided in the original listing: the in_iterator typedef)

            /**
                To assist in allowing compile-time computation of the algorithmic order of the threading model.
            */
            // (elided in the original listing: the memory_access_mode definition)

            constexpr for_each_reduce(in_iterator const &b, in_iterator const &e, operation_type &w) noexcept(true) FORCE_INLINE
            : beg(b), end(e), fn(w) {
            }

            void __fastcall process() const FORCE_INLINE {
                std::for_each(beg, end, fn.input().op);
            }

            constexpr bool __fastcall operator<(for_each_reduce const &) const noexcept(true) FORCE_INLINE {
                return true;
            }

        private:
            const in_iterator beg, end;
            operation_type &fn;
        };
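
        /*
            A minimal serial sketch of what each for_each_reduce::process() amounts to:
            std::for_each over one sub-range of the input. The fixed two-way split below
            is purely illustrative; the pool chooses the real partitioning.

                #include <algorithm>
                #include <vector>

                void increment_all(std::vector<int> &v) {
                    const auto mid=v.begin()+v.size()/2;
                    const auto op=[](int &i) { ++i; };
                    std::for_each(v.begin(), mid, op);    // one worker's sub-range
                    std::for_each(mid, v.end(), op);    // another worker's sub-range
                }
        */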

        /// Assist with implementing the parallel versions of the standard algorithms.
        /**
            \see count_if_reduce
            \see thread_base::count_if_t
            \see thread_base::alg_wrapper1
        */
        template<typename Pred, typename CTR>
        struct countor_work_type {
            typedef CTR result_type;
            typedef Pred operation_type;

            /**
                To assist in allowing compile-time computation of the algorithmic order of the threading model.
            */
            static constexpr ppd::generic_traits::memory_access_modes memory_access_mode=result_type::memory_access_mode;

            operation_type pred;

            explicit constexpr countor_work_type(operation_type const &p) noexcept(true) FORCE_INLINE
            : pred(p) {
            }

            constexpr void __fastcall process(result_type &) noexcept(true) FORCE_INLINE {
            }

            constexpr bool __fastcall operator<(countor_work_type const &) const noexcept(true) FORCE_INLINE {
                return true;
            }

            template<class T>
            static constexpr void FORCE_INLINE resize_output(T const &) noexcept(true) {}
        };

        /// Assist with implementing the parallel versions of the standard algorithms.
        /**
            Implements a reduction operation.
            Note that this operation should operate on an output range that no other thread should modify, i.e. that range should have at least a read-lock taken on it.

            \see countor_work_type
            \see thread_base::count_if_t
            \see thread_base::alg_wrapper1
        */
        template<class Conts, typename CtrPred>
        struct count_if_reduce {
            typedef CtrPred operation_type;
            typedef Conts containers_type;
            // (elided in the original listing: the in_iterator typedef)

            /**
                To assist in allowing compile-time computation of the algorithmic order of the threading model.
            */
            // (elided in the original listing: the multi-line memory_access_mode definition)

            constexpr count_if_reduce(in_iterator const &b, in_iterator const &e, operation_type &w) noexcept(true) FORCE_INLINE
            : beg(b), end(e), fn(w) {
            }

            void __fastcall process() FORCE_INLINE;

            constexpr bool __fastcall operator<(count_if_reduce const &) const noexcept(true) FORCE_INLINE {
                return true;
            }

        private:
            const in_iterator beg, end;
            operation_type &fn;
        };
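
        /*
            A serial sketch of the count_if reduction: partial counts from disjoint
            sub-ranges are combined with operator+. Illustrative only; not this
            library's API.

                #include <algorithm>
                #include <cstddef>
                #include <vector>

                std::ptrdiff_t count_evens(std::vector<int> const &v) {
                    const auto mid=v.begin()+v.size()/2;
                    const auto pred=[](int i) { return i%2==0; };
                    const auto lhs=std::count_if(v.begin(), mid, pred);    // one process()
                    const auto rhs=std::count_if(mid, v.end(), pred);    // another process()
                    return lhs+rhs;    // the reduction step
                }
        */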

        /// Assist with implementing the parallel versions of the standard algorithms.
        /**
            \see accumulate_reduce
        */
        template<typename BinOp, typename Acc>
        struct accumulator_work_type {
            typedef Acc result_type;
            typedef BinOp operation_type;

            /**
                To assist in allowing compile-time computation of the algorithmic order of the threading model.
            */
            static constexpr ppd::generic_traits::memory_access_modes memory_access_mode=result_type::memory_access_mode;

            operation_type binop;
            result_type init;

            constexpr accumulator_work_type(operation_type const &p, result_type &&i) noexcept(true) FORCE_INLINE
            : binop(p), init(std::forward<result_type>(i)) {
            }

            constexpr void process(result_type &) noexcept(true) FORCE_INLINE {
            }

            constexpr bool __fastcall operator<(accumulator_work_type const &) const noexcept(true) FORCE_INLINE {
                return true;
            }

            template<class T>
            static void FORCE_INLINE resize_output(T const &) noexcept(true) {}
        };

        /// Assist with implementing the parallel versions of the standard algorithms.
        /**
            Implements a reduction operation.
            Note that this operation should operate on an output range that no other thread should modify, i.e. that range should have at least a read-lock taken on it.

            \see accumulator_work_type
            \see thread_base::accumulate_t
            \see thread_base::alg_wrapper1
        */
        template<class Conts, typename Fn>
        struct accumulate_reduce {
            typedef Fn operation_type;
            typedef Conts containers_type;
            // (elided in the original listing: the in_iterator typedef)

            /**
                To assist in allowing compile-time computation of the algorithmic order of the threading model.
            */
            // (elided in the original listing: the multi-line memory_access_mode definition)

            constexpr __stdcall accumulate_reduce(in_iterator const &b, in_iterator const &e, operation_type &w) FORCE_INLINE
            : beg(b), end(e), fn(w) {
            }

            void __fastcall process() FORCE_INLINE;

            constexpr bool __fastcall operator<(accumulate_reduce const &) const noexcept(true) FORCE_INLINE {
                return true;
            }

        private:
            const in_iterator beg, end;
            operation_type &fn;
        };
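
        /*
            A serial sketch of the accumulate reduction. Note that splitting the range
            is only equivalent to a serial std::accumulate when the binary operation is
            associative. Illustrative only.

                #include <functional>
                #include <numeric>
                #include <vector>

                int sum(std::vector<int> const &v, int init) {
                    const auto mid=v.begin()+v.size()/2;
                    const auto binop=std::plus<int>();
                    const int lhs=std::accumulate(v.begin(), mid, init, binop);
                    const int rhs=std::accumulate(mid, v.end(), 0, binop);
                    return binop(lhs, rhs);    // combine the partial results
                }
        */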

        /// Assist with implementing the parallel versions of the standard algorithms.
        /**
            Note how, if the input collection is a collection of unique elements, only one item will write to the output, so there is no need to implement any locking on the output. Also note that once the item is found it is implementation-defined how far the search continues within the remaining range.
            Note that this operation should operate on an output range that no other thread should modify, i.e. that range should have at least a read-lock taken on it.

            \todo It would be nice if, once found, this could cancel any pending tasks.

            \see countor_work_type
            \see thread_base::find_if_t
            \see thread_base::alg_wrapper1
        */
        template<class Conts, typename CtrPred>
        struct find_if_reduce {
            typedef CtrPred operation_type;
            typedef Conts containers_type;
            // (elided in the original listing: the in_iterator typedef)

            /**
                To assist in allowing compile-time computation of the algorithmic order of the threading model.
            */
            // (elided in the original listing: the multi-line memory_access_mode definition)

            constexpr __stdcall find_if_reduce(in_iterator const &b, in_iterator const &e, operation_type &w) FORCE_INLINE;

            void __fastcall process() FORCE_INLINE;

            constexpr bool __fastcall operator<(find_if_reduce const &) const noexcept(true) FORCE_INLINE {
                return true;
            }

        private:
            const in_iterator beg, end;
            operation_type &fn;
        };
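
        /*
            A serial sketch of the find_if reduction: a hit in an earlier sub-range
            wins; as noted above, how far the other sub-ranges keep searching after a
            hit is implementation-defined in the parallel form. Illustrative only.

                #include <algorithm>
                #include <vector>

                std::vector<int>::const_iterator find_negative(std::vector<int> const &v) {
                    const auto mid=v.begin()+v.size()/2;
                    const auto pred=[](int i) { return i<0; };
                    const auto lhs=std::find_if(v.begin(), mid, pred);
                    if(lhs!=mid) {
                        return lhs;
                    }
                    return std::find_if(mid, v.end(), pred);    // v.end() if absent
                }
        */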

        /// Assist with implementing the parallel versions of the standard algorithms.
        /**
            Implements a reduction operation.
            Note that this operation should operate on an output range that no other thread should modify, i.e. that range should have at least a read-lock taken on it.

            \see accumulator_work_type
            \see thread_base::max_element_t
            \see thread_base::alg_wrapper1
        */
        template<class Conts, typename Fn>
        class max_element_reduce {
        public:
            typedef Fn operation_type;
            typedef Conts containers_type;
            // (elided in the original listing: the iterator typedefs)

            /**
                To assist in allowing compile-time computation of the algorithmic order of the threading model.
            */
            // (elided in the original listing: the multi-line memory_access_mode definition)

            constexpr __stdcall max_element_reduce(in_iterator const &b, in_iterator const &e, operation_type &w) FORCE_INLINE;

            void __fastcall process() FORCE_INLINE;

            constexpr bool __fastcall operator<(max_element_reduce const &) const noexcept(true) FORCE_INLINE {
                return true;
            }

        private:
            class max;

            const in_iterator beg, end;
            operation_type &fn;
        };

        /// Assist with implementing the parallel versions of the standard algorithms.
        /**
            Implements a reduction operation.
            Note that this operation should operate on an output range that no other thread should modify, i.e. that range should have at least a read-lock taken on it.

            \see accumulator_work_type
            \see thread_base::min_element_t
            \see thread_base::alg_wrapper1
        */
        template<class Conts, typename Fn>
        class min_element_reduce {
        public:
            typedef Fn operation_type;
            typedef Conts containers_type;
            // (elided in the original listing: the iterator typedefs)

            /**
                To assist in allowing compile-time computation of the algorithmic order of the threading model.
            */
            // (elided in the original listing: the multi-line memory_access_mode definition)

            constexpr __stdcall min_element_reduce(in_iterator const &b, in_iterator const &e, operation_type &w) FORCE_INLINE;

            void __fastcall process() FORCE_INLINE;

            constexpr bool __fastcall operator<(min_element_reduce const &) const noexcept(true) FORCE_INLINE {
                return true;
            }

        private:
            class min;

            const in_iterator beg, end;
            operation_type &fn;
        };
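
        /*
            A serial sketch of the min/max reduction: sub-range extrema are combined
            with one final comparison. Illustrative only.

                #include <algorithm>
                #include <cassert>
                #include <vector>

                int max_value(std::vector<int> const &v) {
                    assert(!v.empty());
                    const auto mid=v.begin()+v.size()/2;
                    const auto lhs=std::max_element(v.begin(), mid);
                    const auto rhs=std::max_element(mid, v.end());
                    if(lhs==mid) {    // the left sub-range was empty
                        return *rhs;
                    }
                    return std::max(*lhs, *rhs);    // combine the partial maxima
                }
        */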

        /// Assist with implementing the parallel versions of the standard algorithms.
        /**
            Note that this operation should operate on an output range that no other thread should modify, i.e. that range should have at least a read-lock taken on it.

            \see for_each_work_type
            \see thread_base::transform_t
            \see thread_base::alg_wrapper2
        */
        template<class Conts, typename UniOp>
        struct transform_reduce {
            typedef UniOp operation_type;
            typedef Conts containers_type;
            // (elided in the original listing: the in_iterator & out_iterator typedefs)

            /**
                To assist in allowing compile-time computation of the algorithmic order of the threading model.
            */
            // (elided in the original listing: the memory_access_mode definition)

            constexpr __stdcall transform_reduce(in_iterator const &ib, in_iterator const &ie, out_iterator const &o, operation_type const &w) FORCE_INLINE
            : in_beg(ib), in_end(ie), out(o), fn(w) {
            }

            void __fastcall process() FORCE_INLINE {
                std::transform(in_beg, in_end, out, fn.input().op);
            }

            constexpr bool __fastcall operator<(transform_reduce const &) const noexcept(true) FORCE_INLINE {
                return true;
            }

        private:
            const in_iterator in_beg, in_end;
            out_iterator out;
            const operation_type &fn;
        };
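
        /*
            A serial sketch of the transform reduction: the output is sized first
            (cf. transform_work_type::resize_output()), then disjoint sub-ranges are
            written without overlap. Illustrative only.

                #include <algorithm>
                #include <vector>

                void squares(std::vector<int> const &in, std::vector<int> &out) {
                    out.resize(in.size());
                    const auto mid=in.size()/2;
                    const auto op=[](int i) { return i*i; };
                    std::transform(in.begin(), in.begin()+mid, out.begin(), op);
                    std::transform(in.begin()+mid, in.end(), out.begin()+mid, op);
                }
        */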

        /// Assist with implementing the parallel versions of the standard algorithms.
        /**
            Note that this operation should operate on an output range that no other thread should modify, i.e. that range should have at least a read-lock taken on it.

            \see for_each_work_type
            \see thread_base::transform2_t
            \see thread_base::alg_wrapper2
        */
        template<class Conts, typename BinOp>
        struct transform2_reduce {
            typedef BinOp operation_type;
            typedef Conts containers_type;
            // (elided in the original listing: the in_iterator, in2_iterator & out_iterator typedefs)

            /**
                To assist in allowing compile-time computation of the algorithmic order of the threading model.
            */
            // (elided in the original listing: the memory_access_mode definition)

            constexpr __stdcall transform2_reduce(in_iterator const &i1b, in_iterator const &i1e, in2_iterator const &i2b, out_iterator const &o, operation_type const &w) FORCE_INLINE
            : in1_beg(i1b), in1_end(i1e), in2_beg(i2b), iter_out(o), fn(w) {
            }

            void __fastcall process() FORCE_INLINE {
                std::transform(in1_beg, in1_end, in2_beg, iter_out, fn.input().op);
            }

            constexpr bool __fastcall operator<(transform2_reduce const &) const noexcept(true) FORCE_INLINE {
                return true;
            }

        private:
            const in_iterator in1_beg, in1_end;
            const in2_iterator in2_beg;
            out_iterator iter_out;
            const operation_type &fn;
        };

        /// Assist with implementing the parallel versions of the standard algorithms.
        /**
            \see reverse_reduce
        */
        template<class Colln>
        struct reverse_work_type {
            typedef std::pointer_to_binary_function<typename Colln::container_type::iterator, typename Colln::container_type::iterator, void> operation_type;
            // (elided in the original listing; reconstructed from the use of std::iter_swap below)
            typedef typename operation_type::first_argument_type first_argument_type;
            typedef typename operation_type::second_argument_type second_argument_type;

            operation_type binop;

            constexpr reverse_work_type() noexcept(true) FORCE_INLINE
            : binop(&std::iter_swap<first_argument_type, second_argument_type>) {    // (initialiser elided in the original listing; reconstructed by analogy with the constructor below)
            }
            constexpr reverse_work_type(typename Colln::container_type::iterator cb, typename Colln::container_type::iterator ce) noexcept(true) FORCE_INLINE
            : binop(&std::iter_swap<first_argument_type, second_argument_type>), cont_beg_(cb), cont_end_(ce) {
            }

            constexpr void __fastcall process() noexcept(true) FORCE_INLINE {
            }

            constexpr typename Colln::container_type::iterator __fastcall cont_beg() const noexcept(true) FORCE_INLINE {
                return cont_beg_;
            }
            constexpr typename Colln::container_type::iterator __fastcall cont_end() const noexcept(true) FORCE_INLINE {
                return cont_end_;
            }

            constexpr bool __fastcall operator<(reverse_work_type const &) const noexcept(true) FORCE_INLINE {
                return true;
            }

            template<class CoreWk>
            static constexpr void FORCE_INLINE resize_output(CoreWk &) noexcept(true) {
            }

        private:
            typename Colln::container_type::iterator cont_beg_;
            typename Colln::container_type::iterator cont_end_;
        };
        /// Assist with implementing the parallel versions of the standard algorithms.
        /**
            Note that this operation should operate on an output range that no other thread should modify, i.e. that range should have at least a read-lock taken on it.

            \see reverse_work_type
            \see thread_base::reverse_t
            \see thread_base::alg_wrapper1
        */
        template<class Conts, typename Fn>
        struct reverse_reduce {
            typedef Fn operation_type;
            typedef Conts containers_type;
            // (elided in the original listing: the in_iterator typedef)

            /**
                To assist in allowing compile-time computation of the algorithmic order of the threading model.
            */
            // (elided in the original listing: the memory_access_mode definition)

            constexpr __stdcall reverse_reduce(in_iterator const &bs, in_iterator const &es, operation_type const &w) FORCE_INLINE;

            void __fastcall process() const FORCE_INLINE;

            constexpr bool __fastcall operator<(reverse_reduce const &) const noexcept(true) FORCE_INLINE {
                return true;
            }

        private:
            const in_iterator beg_subrange, end_subrange;
            const operation_type &fn;
            const typename std::iterator_traits<in_iterator>::difference_type cont_size;
        };
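
        /*
            A serial sketch of the reversal: mirrored pairs are exchanged with
            std::iter_swap(), and distinct slices of those pairs are independent,
            which is what lets reverse_reduce split the work. Illustrative only.

                #include <algorithm>
                #include <cstddef>
                #include <vector>

                void reverse_in_two_slices(std::vector<int> &v) {
                    const std::size_t pairs=v.size()/2;
                    for(std::size_t i=0; i<pairs/2; ++i) {    // one worker's slice
                        std::iter_swap(v.begin()+i, v.end()-1-i);
                    }
                    for(std::size_t i=pairs/2; i<pairs; ++i) {    // another worker's slice
                        std::iter_swap(v.begin()+i, v.end()-1-i);
                    }
                }
        */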

        /// Assist with implementing the parallel versions of the standard algorithms.
        /**
            Note that this operation should operate on an output range that no other thread should modify, i.e. that range should have at least a read-lock taken on it.

            \see pass_value
            \see thread_base::fill_n
            \see thread_base::alg_wrapper1
        */
        template<typename Conts, class UniOp>
        struct fill_n_reduce {
            typedef UniOp operation_type;
            typedef Conts containers_type;
            // (elided in the original listing: the in_iterator typedef)

            /**
                To assist in allowing compile-time computation of the algorithmic order of the threading model.
            */
            // (elided in the original listing: the memory_access_mode definition)

            constexpr __stdcall fill_n_reduce(in_iterator b, in_iterator e, operation_type const &op) FORCE_INLINE
            : beg(b), end(e), val(op) {
            }

            void __fastcall process() const FORCE_INLINE;

            constexpr bool __fastcall operator<(fill_n_reduce const &) const noexcept(true) FORCE_INLINE {
                return true;
            }

        private:
            in_iterator beg, end;
            operation_type const &val;
        };

        /// Assist with implementing the parallel versions of the standard algorithms.
        /**
            Note that this operation should operate on an output range that no other thread should modify, i.e. that range should have at least a read-lock taken on it.

            \see pass_value
            \see thread_base::fill
            \see thread_base::alg_wrapper1
        */
        template<typename Conts, class UniOp>
        struct fill_reduce {
            typedef UniOp operation_type;
            typedef Conts containers_type;
            // (elided in the original listing: the in_iterator typedef)

            /**
                To assist in allowing compile-time computation of the algorithmic order of the threading model.
            */
            // (elided in the original listing: the memory_access_mode definition)

            constexpr __stdcall fill_reduce(in_iterator b, in_iterator e, operation_type const &op) FORCE_INLINE
            : beg(b), end(e), val(op) {
            }

            void __fastcall process() const FORCE_INLINE;

            constexpr bool __fastcall operator<(fill_reduce const &) const noexcept(true) FORCE_INLINE {
                return true;
            }

        private:
            in_iterator beg, end;
            operation_type const &val;
        };

        /// Assist with implementing the parallel versions of the standard algorithms.
        /**
            Note that this operation should operate on an output range that no other thread should modify, i.e. that range should have at least a read-lock taken on it.

            \see for_each_work_type
            \see thread_base::swap_ranges_t
            \see thread_base::alg_wrapper1
        */
        template<class Conts, typename Pred>
        struct swap_ranges_reduce {
            typedef Pred operation_type;
            typedef Conts containers_type;
            // (elided in the original listing: the in_iterator & out_iterator typedefs)

            /**
                To assist in allowing compile-time computation of the algorithmic order of the threading model.
            */
            // (elided in the original listing: the memory_access_mode definition)

            constexpr __stdcall swap_ranges_reduce(out_iterator const &b1, in_iterator const &e1, out_iterator const &b2, operation_type const &f) FORCE_INLINE
            : begin1(b1), end1(e1), begin2(b2), fn(f) {
            }

            void __fastcall process() FORCE_INLINE;

            constexpr bool __fastcall operator<(swap_ranges_reduce const &) const noexcept(true) FORCE_INLINE {
                return true;
            }

        private:
            out_iterator begin1;
            const in_iterator end1;
            out_iterator begin2;
            operation_type const &fn;
        };
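
        /*
            A serial sketch of the swap_ranges reduction over disjoint sub-ranges;
            b is assumed to be at least as long as a. Illustrative only.

                #include <algorithm>
                #include <vector>

                void swap_all(std::vector<int> &a, std::vector<int> &b) {
                    const auto mid=a.size()/2;
                    std::swap_ranges(a.begin(), a.begin()+mid, b.begin());
                    std::swap_ranges(a.begin()+mid, a.end(), b.begin()+mid);
                }
        */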

        /// Assist with implementing the parallel versions of the standard algorithms.
        /**
            \see merge_reduce
        */
        template<class Comp, class TPB>
        struct merge_work_type {
            typedef void result_type;
            typedef Comp operation_type;
            typedef TPB thread_pool_type;

            // (elided in the original listing; names recovered from the constructor)
            operation_type comp;
            thread_pool_type &pool;

            constexpr __stdcall merge_work_type(operation_type const &o, thread_pool_type &p) noexcept(true) FORCE_INLINE
            : comp(o), pool(p) {
            }

            constexpr void __fastcall process() noexcept(true) FORCE_INLINE {
            }

            constexpr bool __fastcall operator<(merge_work_type const &) const noexcept(true) FORCE_INLINE {
                return true;
            }

            template<class CoreWk>
            static void resize_output(CoreWk &wk) noexcept(false) FORCE_INLINE;
        };
        /*
            These bits are so damn ugly I want to throw up, but that's optimisations for you....
            They can't be members of batchers_bitonic_merge_reduce because of the template-template parameter to the batchers_bitonic_merge_reduce::merge class, which also needs to be specified in the sort algorithm.
        */
        /// The direction of the resultant output sequence from a merge or sort operation.
        enum class direction {
            ascending,
            descending
        };
        /// The comparator operator to be used within the merge or sort operation.
        template<direction Dir, class out_iterator, class Closure>
        class swap_pred : public std::binary_function<typename out_iterator::value_type, typename out_iterator::value_type, bool> {
        public:
            static constexpr direction dir=Dir;

            explicit constexpr swap_pred(Closure const &c) noexcept(true) FORCE_INLINE
            : arg(c) {}

            bool __fastcall operator()(typename out_iterator::value_type const &lhs, typename out_iterator::value_type const &rhs) const noexcept(noexcept(std::declval<typename Closure::argument_type>().comp(lhs, rhs))) FORCE_INLINE;

        private:
            typename Closure::argument_type const &arg;
        };
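
        /*
            A sketch of how the direction parameter selects the comparison: ascending
            uses the wrapped comparator as-is, descending its negation (cf. the
            std::binary_negate in the sorters below). The closure_t stand-in is an
            assumption for exposition, not this library's type.

                #include <functional>

                struct closure_t {
                    std::less<int> comp;
                };

                template<direction Dir>
                bool ordered(closure_t const &c, int lhs, int rhs) {
                    if constexpr(Dir==direction::ascending) {
                        return c.comp(lhs, rhs);
                    } else {
                        return !c.comp(lhs, rhs);
                    }
                }
        */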
        /// Merge operations are predicated upon the two input queues being sorted, so we can improve the algorithmic complexity by making use of std::merge() to merge the final sub-ranges in O(n/p) time. Note that the input is a bitonic sub-range, which makes this algorithm more complex.
        template<class Iter, class operation_type, direction LHSDir, direction RHSDir, class Dummy>
        struct merge_final_sorter {    // (name elided in the original listing; recovered from the init_merger_t typedef and \see reference below)
            static constexpr direction lhs_dir=LHSDir;
            static constexpr direction rhs_dir=RHSDir;
            typedef Iter out_iterator;
            // (a typedef elided in the original listing)
            typedef swap_pred<rhs_dir, out_iterator, operation_type> swapper_t;
            typedef boost::function<void (out_iterator, out_iterator, std::binary_negate<swapper_t>)> sort_fn_t;
            // (elided in the original listing: probably the arg1_type & arg2_type typedefs, cf. the SortFn variant below)
            typedef std::binary_negate<swapper_t> arg3_type;

            /**
                \todo What about std::inplace_merge()?

                \see std::merge()
            */
            static void __fastcall process(Dummy const &, out_iterator const begin, out_iterator const end, operation_type const &fn) noexcept(false) FORCE_INLINE;
        };
        template<class Iter, class operation_type, direction LHSDir, direction RHSDir, class SortFn>
        struct merge_sub_sorter {    // (the struct's name was elided in the original listing; "merge_sub_sorter" is a hypothetical stand-in)
            static constexpr direction lhs_dir=LHSDir;
            static constexpr direction rhs_dir=RHSDir;
            typedef Iter out_iterator;
            // (a typedef elided in the original listing)
            typedef swap_pred<lhs_dir, out_iterator, operation_type> swapper_t;
            typedef SortFn sort_fn_t;
            typedef typename sort_fn_t::arg1_type arg1_type;
            typedef typename sort_fn_t::arg2_type arg2_type;
            typedef typename sort_fn_t::arg3_type arg3_type;

            static void __fastcall process(sort_fn_t const &sfn, out_iterator const begin, out_iterator const end, operation_type const &fn) noexcept(false) FORCE_INLINE {
                sfn(begin, end, arg3_type(swapper_t(fn)));
            }
        };
        /// Assist with implementing the parallel versions of the standard algorithms.
        /**
            Note that this operation should operate on an output range that no other thread should modify, i.e. that range should have at least a read-lock taken on it.

            [1] <a href="http://www.cs.uoregon.edu/research/paraducks/papers/psc94.d/node2.html"/>

            \see merge_work_type
            \see thread_base::merge_t
            \see thread_base::alg_wrapper2
        */
        template<class Conts, typename Comp>
        class batchers_bitonic_merge_reduce {
        public:
            typedef Comp operation_type;
            typedef typename operation_type::thread_pool_type thread_pool_type;    // (elided in the original listing; reconstructed from the use of thread_pool_type just below)
            typedef typename thread_pool_type::joinable joinable;
            typedef Conts containers_type;
            // (further typedefs elided in the original listing)

            template<
                direction LHSDir,
                direction RHSDir,
                template<class, class, direction, direction, class> class FinalSort    ///< Ugly variation point for introducing an optimisation. Merging is predicated upon sorted inputs, unlike sort, so merging the final sub-ranges can be done in O(n/p) time, which is faster than O(nlog(n)/p) time, and noticeable in testing.
            >
            class merge;

        private:
            /**
                Make use of std::merge() to merge the sub-collections in O(n/p) time, which we can do, as the input collections must be sorted as a precondition for the algorithm.

                \see merge_final_sorter
            */
            typedef merge<direction::ascending, direction::ascending, merge_final_sorter> init_merger_t;
            typedef typename init_merger_t::sort_fn_t sort_fn_t;

            void combine();

        public:
            /**
                To assist in allowing compile-time computation of the algorithmic order of the threading model.
            */
            // (elided in the original listing: the multi-line memory_access_mode definition)

            /**
                Complexity: O(log^2(O(sfn)/p+log(p)))
            */
            constexpr batchers_bitonic_merge_reduce(containers_type &c, operation_type const &f, cliques::element_type const cl) noexcept(true) FORCE_INLINE;    // (declaration elided in the original listing; reconstructed by analogy with bitonic_sort_reduce below)
            virtual ~batchers_bitonic_merge_reduce() noexcept(true) FORCE_INLINE {}

            /**
                If std::stable_sort() is used then this is on average O(n.log(n)), with enough memory.
            */
            void __fastcall process();

            constexpr bool __fastcall operator<(batchers_bitonic_merge_reduce const &) const noexcept(true) FORCE_INLINE {
                return true;
            }

        private:
            containers_type &conts;
            operation_type const &fn;
            cliques::element_type const clique;
        };
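
        /*
            The optimisation that merge_final_sorter exploits, in miniature: two inputs
            that are already sorted can be combined with std::merge() in O(n), cheaper
            than re-sorting the concatenation in O(n.log(n)). Illustrative only.

                #include <algorithm>
                #include <iterator>
                #include <vector>

                std::vector<int> merge_sorted(std::vector<int> const &lhs, std::vector<int> const &rhs) {
                    std::vector<int> out;
                    out.reserve(lhs.size()+rhs.size());
                    std::merge(lhs.begin(), lhs.end(), rhs.begin(), rhs.end(), std::back_inserter(out));
                    return out;
                }
        */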

        /// Assist with implementing the parallel versions of the standard algorithms.
        /**
            \see sort_reduce
        */
        template<class Comp, class TPB>
        struct sort_work_type {
            typedef void result_type;
            typedef Comp operation_type;
            typedef TPB thread_pool_type;

            // (elided in the original listing; names recovered from the constructor)
            operation_type comp;
            thread_pool_type &pool;

            constexpr __stdcall sort_work_type(operation_type const &o, thread_pool_type &p) noexcept(true) FORCE_INLINE
            : comp(o), pool(p) {
            }

            constexpr void __fastcall process() noexcept(true) FORCE_INLINE {
            }

            constexpr bool __fastcall operator<(sort_work_type const &) const noexcept(true) FORCE_INLINE {
                return true;
            }

            template<class CoreWk>
            static constexpr void FORCE_INLINE resize_output(CoreWk &) noexcept(true) {
            }
        };
        /// Assist with implementing the parallel versions of the standard algorithms.
        /**
            Note that this operation should operate on an output range that no other thread should modify, i.e. that range should have at least a read-lock taken on it.

            [1] <a href="http://www.iti.fh-flensburg.de/lang/algorithmen/sortieren/bitonic/bitonicen.htm"/>

            \see sort_work_type
            \see thread_base::sort_t
            \see thread_base::alg_wrapper1
        */
        template<typename Conts, class Comp>
        struct bitonic_sort_reduce {
            typedef Comp operation_type;
            typedef Conts containers_type;
            // (further typedefs elided in the original listing)

            template<direction dir>
            class sort;

            /**
                To assist in allowing compile-time computation of the algorithmic order of the threading model.
            */
            // (elided in the original listing: the memory_access_mode definition)

            constexpr bitonic_sort_reduce(containers_type &c, operation_type const &op, cliques::element_type const cl) noexcept(true) FORCE_INLINE;
            virtual ~bitonic_sort_reduce() noexcept(true) FORCE_INLINE {
            }

            void __fastcall process() const FORCE_INLINE;

            constexpr bool __fastcall operator<(bitonic_sort_reduce const &) const noexcept(true) FORCE_INLINE {
                return true;
            }

        private:
            containers_type &cont;
            operation_type const &fn;
            cliques::element_type const clique;
        };
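
        /*
            A serial rendering of the bitonic sort from the cited reference [1], for
            power-of-two sizes: sort the halves in opposite directions to form a
            bitonic sequence, then bitonic-merge it. The parallel form hands the
            independent compare-exchange ranges to pool threads. Illustrative only.

                #include <algorithm>
                #include <cstddef>
                #include <vector>

                void bitonic_merge(std::vector<int> &v, std::size_t lo, std::size_t n, bool ascending) {
                    if(n>1) {
                        const std::size_t m=n/2;
                        for(std::size_t i=lo; i<lo+m; ++i) {
                            if((v[i]>v[i+m])==ascending) {
                                std::swap(v[i], v[i+m]);
                            }
                        }
                        bitonic_merge(v, lo, m, ascending);
                        bitonic_merge(v, lo+m, m, ascending);
                    }
                }

                void bitonic_sort(std::vector<int> &v, std::size_t lo, std::size_t n, bool ascending) {
                    if(n>1) {
                        const std::size_t m=n/2;
                        bitonic_sort(v, lo, m, !ascending);    // opposite directions...
                        bitonic_sort(v, lo+m, m, ascending);    // ...make the whole range bitonic
                        bitonic_merge(v, lo, n, ascending);
                    }
                }
        */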

    }

    template<class T>
    struct stl_functor_result_type {
        typedef T value_type;

        /**
            To assist in allowing compile-time computation of the algorithmic order of the threading model.
        */
        // (elided in the original listing: the memory_access_mode definition)

        value_type result;

        constexpr __stdcall stl_functor_result_type() noexcept(true) FORCE_INLINE {
        }
        constexpr __stdcall stl_functor_result_type(value_type &&r) noexcept(true) FORCE_INLINE
        : result(std::forward<value_type>(r)) {
        }
        /// Note the use of an automatic conversion here.
        constexpr __fastcall operator value_type const &() const noexcept(true) FORCE_INLINE {
            return result;
        }

        bool __fastcall operator<(stl_functor_result_type const &rhs) const noexcept(true) FORCE_INLINE {
            return result<rhs.result;
        }
    };

    /// An adaptor to allow STL unary functions to be operated upon in the thread_pool.
    /**
        Note that the input is evaluated by transferring it into the pool, and the execution_context that holds the result has an automatic conversion to the result_type.
    */
    template<class ArgT, class UniFn, class PT>
    class unary_fun_work_type {
    public:
        typedef PT pool_type;
        typedef UniFn operation_type;
        // (elided in the original listing: the result_type typedef)
        typedef ArgT argument_type;

    private:
        struct arg_int_work_type;

        struct arg_context_t;

        using shared_ptr_t=shared_ptr<arg_context_t, api_lock_traits<platform_api, sequential_mode>>;

    public:
        /**
            To assist in allowing compile-time computation of the algorithmic order of the threading model.
        */
        static constexpr ppd::generic_traits::memory_access_modes memory_access_mode=(
            shared_ptr_t::memory_access_mode==ppd::generic_traits::memory_access_modes::crew_memory_access
            // (the remaining lines of this initialiser were elided in the original listing)
        );

        __stdcall unary_fun_work_type(argument_type &&a, operation_type const &o, pool_type &pool) noexcept(false) FORCE_INLINE;

        void __fastcall process(result_type &r) FORCE_INLINE;

        bool __fastcall operator<(unary_fun_work_type const &rhs) const noexcept(true) FORCE_INLINE;

    private:
        operation_type op;
        /// \todo This is done to prevent copying the execution contexts. If we had a transfer ctor, then we could avoid the copy, but we would need to consider whether there would be a problem if the transfer occurred while the work was completing its mutation.
        shared_ptr_t arg_cxt;
    };
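
    /*
        Not this library's API: a std::async analogy for the shape of the adaptor
        above. The functor and its argument are handed off for evaluation, and the
        returned handle yields the result on demand, much as the execution_context's
        automatic conversion to result_type does.

            #include <future>

            int twice(int i) {
                return i*2;
            }

            int evaluate() {
                std::future<int> result=std::async(std::launch::async, &twice, 21);
                return result.get();    // 42
            }
    */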

    /// An adaptor to allow STL binary functions to be operated upon in the thread_pool.
    /**
        Note that the inputs are evaluated by transferring them into the pool, and the execution_context that holds the result has an automatic conversion to the result_type.
    */
    template<class ArgT1, class ArgT2, class BinFn, class PT>
    class binary_fun_work_type {
    public:
        typedef PT pool_type;
        typedef BinFn operation_type;
        // (elided in the original listing: the result_type typedef)
        typedef ArgT1 first_argument_type;
        typedef ArgT2 second_argument_type;

    private:
        template<class Arg>
        struct arg_int_work_type;

        struct arg_contexts_t;

        using shared_ptr_t=shared_ptr<arg_contexts_t, api_lock_traits<platform_api, sequential_mode>>;

    public:
        /**
            To assist in allowing compile-time computation of the algorithmic order of the threading model.
        */
        static constexpr ppd::generic_traits::memory_access_modes memory_access_mode=(
            shared_ptr_t::memory_access_mode==ppd::generic_traits::memory_access_modes::crew_memory_access
            // (the remaining lines of this initialiser were elided in the original listing)
        );

        __stdcall binary_fun_work_type(first_argument_type &&lhs, second_argument_type &&rhs, operation_type const &o, pool_type &pool) noexcept(false) FORCE_INLINE;

        void __fastcall process(result_type &r) FORCE_INLINE;

        bool __fastcall operator<(binary_fun_work_type const &rhs) const noexcept(true) FORCE_INLINE;

        template<class Arg1> constexpr bool __fastcall FORCE_INLINE
        operator<(Arg1 const &) const noexcept(true) {
            return true;
        }

    private:
        operation_type op;
        /// \todo This is done to prevent copying the execution contexts. If we had a transfer ctor, then we could avoid the copy, but we would need to consider whether there would be a problem if the transfer occurred while the work was completing its mutation.
        shared_ptr_t arg_cxts;
    };

} } } }

#endif