libjmmcg  release_579_6_g8cffd
A C++ library containing an eclectic mix of useful, advanced components.
subdivide_n_gen_wk.hpp
Go to the documentation of this file.
1 #ifndef LIBJMMCG_CORE_PRIVATE_SUBDIVIDE_N_GEN_WK_HPP
2 #define LIBJMMCG_CORE_PRIVATE_SUBDIVIDE_N_GEN_WK_HPP
3 /******************************************************************************
4 ** Copyright © 2010 by J.M.McGuiness, coder@hussar.me.uk
5 **
6 ** This library is free software; you can redistribute it and/or
7 ** modify it under the terms of the GNU Lesser General Public
8 ** License as published by the Free Software Foundation; either
9 ** version 2.1 of the License, or (at your option) any later version.
10 **
11 ** This library is distributed in the hope that it will be useful,
12 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 ** Lesser General Public License for more details.
15 **
16 ** You should have received a copy of the GNU Lesser General Public
17 ** License along with this library; if not, write to the Free Software
18 ** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20 
22 
23 namespace jmmcg { namespace LIBJMMCG_VER_NAMESPACE { namespace ppd { namespace private_ {
24 
/// Bookkeeping base-class: holds a count of spawned tasks plus the (locked)
/// collections, exposing accessors, lock/resize helpers, and a virtual set().
/// NOTE(review): Doxygen extract with gaps - the class-declaration line
/// (orig. l.26) and some typedefs (orig. l.30-32, e.g. os_traits,
/// atomic_state_type) are missing; verify against the full header.
25  template<class P, class Collns>
 27  public:
 28  typedef Collns containers_type;
 29  typedef P pool_traits_type;
 33  typedef typename os_traits::lock_traits::template atomic_counter_type<unsigned long> num_tasks_spawned_t;
 34 
 /// Record the expected task count and take a copy/reference of the collections.
 35  __stdcall unlock_collections(typename num_tasks_spawned_t::value_type const i, containers_type const &c) noexcept(true) FORCE_INLINE
 36  : num_tasks_spawned(i), containers_(c) {
 37  }
 39 
 40  containers_type const &__fastcall containers() const noexcept(true) FORCE_INLINE {
 41  return containers_;
 42  }
 43  containers_type &__fastcall containers() noexcept(true) FORCE_INLINE {
 44  return containers_;
 45  }
 46  void resize_output(typename containers_type::size_type const out_colln_size) noexcept(false) FORCE_INLINE {
 47  containers_.resize_output(out_colln_size);
 48  }
 49  void __fastcall lock_containers() noexcept(true) FORCE_INLINE {
 50  containers_.lock();
 51  }
 52 
 /// Atomically bump the outstanding-task counter.
 53  void __fastcall add_a_task() noexcept(true) FORCE_INLINE {
 54  ++num_tasks_spawned;
 55  }
 56 
 /// Defined out-of-line (not in this extract); presumably decrements the
 /// counter and unlocks the collections when it reaches zero - TODO confirm.
 57  virtual atomic_state_type __fastcall set() noexcept(true);
 58 
 59  private:
 60  num_tasks_spawned_t num_tasks_spawned;
 61  containers_type containers_;
 62  };
63 
/// Countdown event: combines the task-counting/unlocking behaviour of
/// unlock_collections with an all-work-done lock that the private lockable
/// interface (try_lock/lock/unlock/reset) forwards to.
/// NOTE(review): extract gaps - typedefs at orig. l.68, 70, 72-74 (e.g.
/// num_tasks_spawned_t, atomic_state_type, all_wk_done_lk_t) are missing.
 64  template<class P, class Collns>
 65  class counted_event final : public P::async_thread_wk_elem_type::work_complete_t, public unlock_collections<P, Collns> {
 66  public:
 67  typedef unlock_collections<P, Collns> base_t;
 69  typedef typename base_t::os_traits os_traits;
 71  typedef typename base_t::timeout_type timeout_type;
 75 
 /// The event starts in the unset state.
 76  __stdcall counted_event(typename num_tasks_spawned_t::value_type const i, containers_type const &c) noexcept(true) FORCE_INLINE
 77  : base_t(i, c), all_wk_done(os_traits::lock_traits::atom_unset) {
 78  }
 79 
 /// Forward to base_t::set(); only when the base reports a state other than
 /// atom_unset is the all-work-done event signalled.
 80  atomic_state_type __fastcall set() noexcept(true) override FORCE_INLINE {
 81  const atomic_state_type state=base_t::set();
 82  if (state==pool_traits_type::os_traits::lock_traits::atom_unset) {
 83  return state;
 84  } else {
 85  return all_wk_done.set();
 86  }
 87  }
 88 
 89  private:
 90  all_wk_done_lk_t all_wk_done;
 91 
 /// Lockable interface: all operations delegate to the internal event.
 92  atomic_state_type __fastcall try_lock() noexcept(true) override FORCE_INLINE {
 93  return all_wk_done.try_lock();
 94  }
 95  atomic_state_type __fastcall lock() noexcept(false) override FORCE_INLINE {
 96  return all_wk_done.lock();
 97  }
 98  atomic_state_type __fastcall lock(const timeout_type t) noexcept(false) override FORCE_INLINE {
 99  return all_wk_done.lock(t);
 100  }
 101  atomic_state_type __fastcall unlock() noexcept(true) override FORCE_INLINE {
 102  return all_wk_done.unlock();
 103  }
 104  atomic_state_type __fastcall reset() noexcept(true) override FORCE_INLINE {
 105  return all_wk_done.reset();
 106  }
 107  };
108 
109  /**
110  Make sure that the work is marked as complete, even in the face of exceptions.
111  */
/// Scope guard: signals the work-complete object from its destructor, so the
/// work is marked done even if the guarded scope exits via an exception.
/// NOTE(review): extract gaps - class-declaration line (orig. l.113) and the
/// memory_access_mode member (orig. l.120) are missing from this listing.
 112  template<class WkC>
 114  public:
 115  typedef WkC work_complete_t;
 116 
 117  /**
 118  To assist in allowing compile-time computation of the algorithmic order of the threading model.
 119  */
 121 
 /// Bind (do not signal) the work-complete object.
 122  explicit __stdcall ensure_wk_complete(work_complete_t &w) noexcept(true) FORCE_INLINE
 123  : all_done(w) {
 124  }
 /// Signal completion exactly once, on scope exit (RAII-style).
 126  __stdcall ~ensure_wk_complete() noexcept(true) FORCE_INLINE {
 127  all_done.set();
 128  }
 129 
 130  private:
 131  work_complete_t &all_done;
 132  };
133 
134  /// Assist with implementing the parallel versions of the standard algorithms.
135  /**
136  \see for_each
137  \see alg_work_wrap
138  */
/// Wraps a unary-range work item: registers itself with the counted_event on
/// construction and guarantees the event is signalled after process(), even on
/// exception, via ensure_wk_complete.
/// NOTE(review): extract gaps - typedefs (orig. l.143, 146, 148), the
/// memory_access_mode declaration body (orig. l.153-158) and the constructor
/// signature line (orig. l.161) are missing from this listing.
 139  template<class P, class Wk, generic_traits::return_data RD_>
 140  class alg_wrapper1 : public Wk {
 141  public:
 142  typedef P pool_traits_type;
 144  typedef Wk work_wrap;
 145  typedef typename work_wrap::result_type result_type;
 147  typedef counted_event<P, typename work_wrap::containers_type> work_complete_t;
 149 
 150  /**
 151  To assist in allowing compile-time computation of the algorithmic order of the threading model.
 152  */
 159  );
 160 
 /// Constructor head missing from this extract; registers one task.
 162  : work_wrap(std::forward<work_wrap>(wk)), all_done(w) {
 163  all_done.add_a_task();
 164  }
 165 
 /// Run the wrapped work; the guard signals all_done on every exit path.
 166  void __fastcall process() noexcept(false) FORCE_INLINE {
 167  const ensure_wk_complete_t e(all_done);
 168  work_wrap::process();
 169  }
 170 
 /// Trivial ordering: all instances compare equal-priority.
 171  constexpr bool __fastcall operator<(alg_wrapper1 const &) const noexcept(true) FORCE_INLINE {
 172  return true;
 173  }
 174 
 175  private:
 176  work_complete_t &all_done;
 177  };
/// Partial specialization of alg_wrapper1 (the specialized return_data value
/// is on the missing class-declaration line, orig. l.179). Same contract as
/// the primary template: count a task on construction, guarantee completion
/// signalling around process().
 178  template<class P, class Wk>
 180  public:
 181  typedef P pool_traits_type;
 183  typedef Wk work_wrap;
 184  typedef typename work_wrap::result_type result_type;
 188 
 189  /**
 190  To assist in allowing compile-time computation of the algorithmic order of the threading model.
 191  */
 198  );
 199 
 200  __stdcall alg_wrapper1(work_wrap &&wk, work_complete_t &w) noexcept(true) FORCE_INLINE
 201  : work_wrap(std::forward<work_wrap>(wk)), all_done(w) {
 202  all_done.add_a_task();
 203  }
 204 
 /// Run the wrapped work; the guard signals all_done on every exit path.
 205  void __fastcall process() noexcept(false) FORCE_INLINE {
 206  const ensure_wk_complete_t e(all_done);
 207  work_wrap::process();
 208  }
 209 
 210  constexpr bool __fastcall operator<(alg_wrapper1 const &) const noexcept(true) FORCE_INLINE {
 211  return true;
 212  }
 213 
 214  private:
 215  work_complete_t &all_done;
 216  };
217 
218  /// Assist with implementing the parallel versions of the standard algorithms.
219  /**
220  \see transform
221  \see alg_work_wrap
222  */
/// Wraps a unary-operation (transform-style) work item; identical completion
/// bookkeeping to alg_wrapper1: register a task on construction, signal the
/// counted_event after process() via the ensure_wk_complete guard.
/// NOTE(review): extract gaps - typedefs (orig. l.227, 230, 232) and the
/// memory_access_mode body (orig. l.237-242) are missing from this listing.
 223  template<class P, class Wk, generic_traits::return_data RD_>
 224  class alg_wrapper2 : public Wk {
 225  public:
 226  typedef P pool_traits_type;
 228  typedef Wk work_wrap;
 229  typedef typename work_wrap::result_type result_type;
 231  typedef counted_event<P, typename work_wrap::containers_type> work_complete_t;
 233 
 234  /**
 235  To assist in allowing compile-time computation of the algorithmic order of the threading model.
 236  */
 243  );
 244 
 245  __stdcall alg_wrapper2(work_wrap &&wk, work_complete_t &w) noexcept(true) FORCE_INLINE
 246  : work_wrap(std::forward<work_wrap>(wk)), all_done(w) {
 247  all_done.add_a_task();
 248  }
 249 
 /// Run the wrapped work; the guard signals all_done on every exit path.
 250  void __fastcall process() noexcept(false) FORCE_INLINE {
 251  const ensure_wk_complete_t e(all_done);
 252  work_wrap::process();
 253  }
 254 
 255  constexpr bool __fastcall operator<(alg_wrapper2 const &) const noexcept(true) FORCE_INLINE {
 256  return true;
 257  }
 258 
 259  private:
 260  work_complete_t &all_done;
 261  };
/// Partial specialization of alg_wrapper2 (specialized return_data value is on
/// the missing class-declaration line, orig. l.263). Same completion contract
/// as the primary template.
 262  template<class P, class Wk>
 264  public:
 265  typedef P pool_traits_type;
 267  typedef Wk work_wrap;
 268  typedef typename work_wrap::result_type result_type;
 272 
 273  /**
 274  To assist in allowing compile-time computation of the algorithmic order of the threading model.
 275  */
 282  );
 283 
 284  __stdcall alg_wrapper2(work_wrap &&wk, work_complete_t &w) noexcept(true) FORCE_INLINE
 285  : work_wrap(std::forward<work_wrap>(wk)), all_done(w) {
 286  all_done.add_a_task();
 287  }
 288 
 /// Run the wrapped work; the guard signals all_done on every exit path.
 289  void __fastcall process() noexcept(false) FORCE_INLINE {
 290  const ensure_wk_complete_t e(all_done);
 291  work_wrap::process();
 292  }
 293 
 294  constexpr bool __fastcall operator<(alg_wrapper2 const &) const noexcept(true) FORCE_INLINE {
 295  return true;
 296  }
 297 
 298  private:
 299  work_complete_t &all_done;
 300  };
301 
302  /// Assist with implementing the parallel versions of the standard algorithms.
303  /**
304  \see transform
305  \see alg_work_wrap
306  */
/// Wraps a binary-operation work item; identical completion bookkeeping to
/// alg_wrapper1/2: register a task on construction, guarantee signalling of
/// the counted_event after process().
/// NOTE(review): extract gaps - typedefs (orig. l.311, 314, 316), the
/// memory_access_mode body (orig. l.321-326) and the constructor signature
/// line (orig. l.329) are missing from this listing.
 307  template<class P, class Wk, generic_traits::return_data RD_>
 308  class alg_wrapper3 : public Wk {
 309  public:
 310  typedef P pool_traits_type;
 312  typedef Wk work_wrap;
 313  typedef typename work_wrap::result_type result_type;
 315  typedef counted_event<P, typename work_wrap::containers_type> work_complete_t;
 317 
 318  /**
 319  To assist in allowing compile-time computation of the algorithmic order of the threading model.
 320  */
 327  );
 328 
 /// Constructor head missing from this extract; registers one task.
 330  : work_wrap(std::forward<work_wrap>(wk)), all_done(w) {
 331  all_done.add_a_task();
 332  }
 333 
 /// Run the wrapped work; the guard signals all_done on every exit path.
 334  void __fastcall process() noexcept(false) FORCE_INLINE {
 335  const ensure_wk_complete_t e(all_done);
 336  work_wrap::process();
 337  }
 338 
 339  constexpr bool __fastcall operator<(alg_wrapper3 const &) const noexcept(true) FORCE_INLINE {
 340  return true;
 341  }
 342 
 343  private:
 344  work_complete_t &all_done;
 345  };
/// Partial specialization of alg_wrapper3 (specialized return_data value is on
/// the missing class-declaration line, orig. l.347). Same completion contract
/// as the primary template.
 346  template<class P, class Wk>
 348  public:
 349  typedef P pool_traits_type;
 351  typedef Wk work_wrap;
 352  typedef typename work_wrap::result_type result_type;
 356 
 357  /**
 358  To assist in allowing compile-time computation of the algorithmic order of the threading model.
 359  */
 366  );
 367 
 368  __stdcall alg_wrapper3(work_wrap &&wk, work_complete_t &w) noexcept(true) FORCE_INLINE
 369  : work_wrap(std::forward<work_wrap>(wk)), all_done(w) {
 370  all_done.add_a_task();
 371  }
 372 
 /// Run the wrapped work; the guard signals all_done on every exit path.
 373  void __fastcall process() noexcept(false) FORCE_INLINE {
 374  const ensure_wk_complete_t e(all_done);
 375  work_wrap::process();
 376  }
 377 
 378  constexpr bool __fastcall operator<(alg_wrapper3 const &) const noexcept(true) FORCE_INLINE {
 379  return true;
 380  }
 381 
 382  private:
 383  work_complete_t &all_done;
 384  };
385 
386  /// Distribute the input range [begin, end) across the thread_pool_type recursively as a collection of tasks.
387  /**
388  This algorithm recursively creates tasks non-joinably until it terminates, when it reaches the leaves which contain contiguous sub-ranges [begin, end) of the initial range and the functor, fn. i.e. it distributes the initial range across the threads_per_clique within the thread_pool_type. The algorithm contains all_done, a counter, that records the number of outstanding tasks, and when that counter reaches zero, the execution_context is released, as all of the sub-tasks have completed, the counter is required because the tasks are transferred non-joinably.
389  */
/// Primary template: recursive work-subdivision base. Carries the work-heap
/// descriptor, buffer-partitioning helpers, and the constructors used by the
/// arity-specific subclasses (subdivide_n_gen_wk1/2/3).
/// NOTE(review): extract gaps - the class-declaration line (orig. l.395), the
/// first template parameter (orig. l.391, presumably Ps given the subclasses),
/// several typedefs (e.g. operation_type, in_iterator, container_type) and
/// protected members (orig. l.439-443) are missing; verify against the full
/// header before relying on this listing.
 390  template<
 392  class TPB, ///< The thread_pool type.
 393  class Alg ///< Housekeeping to ensure that the read_lock_type on the input- & [read_lock_type|write_lock_type] output-container_type is released once all work has been completed, also that once the work has been completed the execution_context is correctly signalled. Also includes the function to be applied to each element in the container_type in some unspecified order. Including the container_type on which to apply the function, of size n.
 394  >
 396  public:
 397  typedef TPB thread_pool_type;
 399  typedef void result_type;
 400  typedef Alg alg_wrap_t;
 402  typedef typename alg_wrap_t::os_traits os_traits;
 405  /// An object that signals to the execution_context when all of the closure_base-derived closures has been process()ed, including being distributed across the threads_per_clique in the thread_pool via the tasks spawned by subdivide_n_gen_wk::process().
 406  /**
 407  This is used so that we don't have to generate an execution_context at each branch, and wait upon it, thus causing vertical pool_threads to be held, and horizontal_execution would have to occur instead, i.e. reducing resources consumed.
 408  */
 410  /**
 411  This type is needed because closure::algo_thread_wk_buffered is dependent upon the exact type of subdivide_n_gen_wk, so is declared after it, but algo_thread_wk_buffered contains the custom memory-buffer. So we have to use a char * to the buffer, and the stride, rather than a more accurate type.
 412  */
 413  struct algo_work_heap_type;
 414 
 415  /**
 416  To assist in allowing compile-time computation of the algorithmic order of the threading model.
 417  */
 418  static constexpr generic_traits::memory_access_modes memory_access_mode=alg_wrap_t::memory_access_mode;
 419 
 421 
 422  static typename thread_pool_type::pool_type::size_type
 423  compute_threads_per_clique(typename thread_pool_type::pool_type::size_type num_threads, typename thread_pool_type::pool_type::size_type const cliques) noexcept(true) FORCE_INLINE;
 424 
 425  /**
 426  This computation is intimately related to the way subdivide_n_gen_wk::process() spawns sub-tasks, and the two must operate in a similar manner, otherwise we might get memory-allocation errors. Note that it over-allocates memory, because it doesn't allow for memory re-use: children could re-use memory of parents.
 427 
 428  \todo This is an O(n) operation, and we might want a faster algorithm, it doesn't have to be perfect, as long as the result is >= the true value.
 429 
 430  \return The number of items allocated in the tree that subdivide_n_gen_wk::process() will generate. Not in bytes, but items.
 431 
 432  \see subdivide_n_gen_wk::process()
 433  */
 434  static typename thread_pool_type::pool_type::size_type
 435  compute_buffer_items(typename thread_pool_type::pool_type::size_type const num_threads_per_clique) noexcept(true) FORCE_INLINE;
 436 
 437  protected:
 438  algo_work_heap_type const work_heap;
 444 
 /// Return type of compute_end is on a missing line (orig. l.445).
 446  compute_end(typename std::iterator_traits<in_iterator>::difference_type const number_subranges) const noexcept(true) FORCE_INLINE;
 447  typename container_type::size_type
 448  num_wk_items_spawned() const noexcept(true) FORCE_INLINE;
 449 
 450  /**
 451  \return This is units of items, not bytes.
 452  */
 453  std::ptrdiff_t odd_third_buff_range() const noexcept(true) FORCE_INLINE;
 454  /**
 455  \return This is units of items, not bytes.
 456  */
 457  std::ptrdiff_t even_half_buff_range() const noexcept(true) FORCE_INLINE;
 /// Buffer-partition accessors used when recursively splitting the work-heap.
 458  typename algo_work_heap_type::buffer_type first_buff_part() const noexcept(true) FORCE_INLINE;
 459  typename algo_work_heap_type::buffer_type even_second_buff_part() const noexcept(true) FORCE_INLINE;
 460  typename algo_work_heap_type::buffer_type odd_second_buff_part() const noexcept(true) FORCE_INLINE;
 461  typename algo_work_heap_type::buffer_type odd_third_buff_part() const noexcept(true) FORCE_INLINE;
 462 
 463  __stdcall subdivide_n_gen_wk(
 464  thread_pool_type &p,
 465  operation_type &f,
 466  typename alg_wrap_t::work_complete_t &w,
 467  algo_work_heap_type const &wh) noexcept(true) FORCE_INLINE;
 468 
 469  /**
 470  \param number_subranges Reduce the size of the range over which this algorithm operates by the amount specified by this number.
 471  */
 472  __stdcall subdivide_n_gen_wk(
 473  thread_pool_type &p,
 474  operation_type &f,
 475  typename alg_wrap_t::work_complete_t &w,
 476  algo_work_heap_type const &wh,
 477  typename std::iterator_traits<in_iterator>::difference_type const number_subranges,
 478  typename thread_pool_type::pool_type::size_type const cliques) noexcept(true) FORCE_INLINE;
 479 
 480  __stdcall subdivide_n_gen_wk(
 481  thread_pool_type &p,
 482  operation_type &f,
 483  typename alg_wrap_t::work_complete_t &w,
 484  algo_work_heap_type const &wh,
 485  in_iterator const &b,
 486  in_iterator const &e) noexcept(true) FORCE_INLINE;
 487 
 488  __stdcall subdivide_n_gen_wk(
 489  thread_pool_type &p,
 490  operation_type &f,
 491  typename alg_wrap_t::work_complete_t &w,
 492  algo_work_heap_type const &wh,
 493  in_iterator const &b,
 494  in_iterator const &e,
 495  typename thread_pool_type::pool_type::size_type const t_per_c) noexcept(true) FORCE_INLINE;
 496 
 498  };
499  /// Distribute the input range [begin, end) across the thread_pool_type recursively as a collection of tasks.
/// Specialization of subdivide_n_gen_wk (the specialized argument is on the
/// missing class-declaration line, orig. l.504): no work-heap buffer here, and
/// the clique/buffer computations are constexpr.
/// NOTE(review): extract gaps - typedefs (e.g. operation_type, in_iterator)
/// and protected members (orig. l.535-539) are missing from this listing.
 500  template<
 501  class TPB, ///< The thread_pool type.
 502  class Alg ///< Housekeeping to ensure that the read_lock_type on the input- & [read_lock_type|write_lock_type] output-container_type is released once all work has been completed, also that once the work has been completed the execution_context is correctly signalled. Also includes the function to be applied to each element in the container_type in some unspecified order. Including the container_type on which to apply the function, of size n.
 503  >
 505  public:
 506  typedef TPB thread_pool_type;
 508  typedef void result_type;
 509  typedef Alg alg_wrap_t;
 511  typedef typename alg_wrap_t::os_traits os_traits;
 514  /// An object that signals to the execution_context when all of the closure_base-derived closures has been process()ed, including being distributed across the threads_per_clique in the thread_pool via the tasks spawned by subdivide_n_gen_wk::process().
 515  /**
 516  This is used so that we don't have to generate an execution_context at each branch, and wait upon it, thus causing vertical pool_threads to be held, i.e. increasing resources consumed.
 517  */
 519  struct algo_work_heap_type;
 520 
 521  /**
 522  To assist in allowing compile-time computation of the algorithmic order of the threading model.
 523  */
 524  static constexpr generic_traits::memory_access_modes memory_access_mode=alg_wrap_t::memory_access_mode;
 525 
 527 
 528  static constexpr typename thread_pool_type::pool_type::size_type
 529  compute_threads_per_clique(typename thread_pool_type::pool_type::size_type, typename thread_pool_type::pool_type::size_type const) noexcept(true) FORCE_INLINE;
 530 
 531  static constexpr typename thread_pool_type::pool_type::size_type
 532  compute_buffer_items(typename thread_pool_type::pool_type::size_type const) noexcept(true) FORCE_INLINE;
 533 
 534  protected:
 540 
 541  __stdcall subdivide_n_gen_wk(
 542  thread_pool_type &p,
 543  operation_type &f,
 544  typename alg_wrap_t::work_complete_t &w) noexcept(true) FORCE_INLINE;
 545 
 546  __stdcall subdivide_n_gen_wk(
 547  thread_pool_type &p,
 548  operation_type &f,
 549  typename alg_wrap_t::work_complete_t &w,
 550  in_iterator const &b,
 551  in_iterator const &e) noexcept(true) FORCE_INLINE;
 552 
 554  };
555 
556  /**
557  This recursive process ensures that it takes O(log(n)) time to submit the work to the pool.
558 
559  Algorithm derived from [1].
560  Note that if the wrapped collection implements CREW or EREW semantics, as safe_colln does, then this algorithm is an implementation CREW/EREW P-RAM model, therefore if the thread_pool is large enough, it implements an optimal schedule according to section 3.3 & Theorem 3.3 in [1].
561 
562  [1] Alan Gibbons, Wojciech Rytter, "Efficient Parallel Algorithms", Cambridge University Press, 1989.
563 
564  \see safe_colln
565  */
/// One-range (for_each-style) subdivision work item, built on
/// subdivide_n_gen_wk with alg_wrapper1 as the completion-bookkeeping wrapper.
/// NOTE(review): extract gaps - the class-declaration line (orig. l.573) and
/// several typedefs (orig. l.591-601) are missing from this listing.
 566  template<
 568  class TPB, ///< The thread_pool type.
 569  class Fn, ///< The functor to be applied to each element in the collection in some unspecified order.
 570  class Conts, ///< The collection on which to apply the function, of size n.
 571  template<class, class> class Alg ///< The algorithm to apply.
 572  >
 574  Ps,
 575  TPB,
 576  alg_wrapper1<
 577  typename TPB::pool_traits_type,
 578  Alg<Conts, Fn>,
 579  TPB::pool_traits_type::result_traits_
 580  >
 581  > {
 582  public:
 583  typedef subdivide_n_gen_wk<
 584  Ps,
 585  TPB,
 586  alg_wrapper1<
 587  typename TPB::pool_traits_type,
 588  Alg<Conts, Fn>,
 589  TPB::pool_traits_type::result_traits_
 590  >
 593  typedef typename base_t::in_iterator in_iterator;
 595  typedef typename base_t::result_type result_type;
 596  typedef typename base_t::alg_wrap_t alg_wrap_t;
 599  typedef typename base_t::os_traits os_traits;
 602  using base_t::compute_threads_per_clique;
 603  using base_t::compute_buffer_items;
 604  using base_t::memory_access_mode;
 605 
 606  private:
 /// Sub-range constructor used by the recursive splitting in process().
 607  __stdcall subdivide_n_gen_wk1(
 608  thread_pool_type &p,
 609  operation_type &f,
 610  typename alg_wrap_t::work_complete_t &w,
 611  algo_work_heap_type const &wh,
 612  in_iterator const &b,
 613  in_iterator const &e,
 614  typename thread_pool_type::pool_type::size_type const threads_per_clique) noexcept(true) FORCE_INLINE;
 615 
 616  public:
 617  __stdcall subdivide_n_gen_wk1(thread_pool_type &p, operation_type &f, typename alg_wrap_t::work_complete_t &w, algo_work_heap_type const &wh, typename std::iterator_traits<in_iterator>::difference_type const number_subranges, typename thread_pool_type::pool_type::size_type const cliques) noexcept(true) FORCE_INLINE;
 618 
 619  /// Recursively call subdivide_n_gen_wk1::process(), on disjoint left and right-subsets (assuming even numbers of processors in the clique) of the input collection, until the number of work items generated is 2^n just larger than the number of threads in the pool, which implements a form of GSS(k) scheduling.
 620  /**
 621  As the subsets are disjoint inter-subset operations are effectively CRCW operations, whereas intra-subset operations are strictly EREW. This subdivision is valid according to Proposition 1.1 in section 1.2 and Brent's Theorem [1].
 622 
 623  [1] Casanova, H., Legrand, A., Robert, Y., "Parallel Algorithms", CRC Press, 2008.
 624 
 625  \see nonjoinable, nonjoinable_buff
 626  */
 627  void __fastcall
 628  process() noexcept(false);
 629 
 630  constexpr bool __fastcall operator<(subdivide_n_gen_wk1 const &) const noexcept(true) FORCE_INLINE {
 631  return true;
 632  }
 633  };
/// Specialization of subdivide_n_gen_wk1 (specialized argument is on the
/// missing class-declaration line, orig. l.640-641); no work-heap parameter in
/// the constructors.
/// NOTE(review): extract gaps - typedefs at orig. l.646, 651, 656, 658-659,
/// 661, 664-670, the public constructor (orig. l.682) and the first ctor
/// parameter line (orig. l.675) are missing from this listing.
 634  template<
 635  class TPB, ///< The thread_pool type.
 636  class Fn, ///< The functor to be applied to each element in the collection in some unspecified order.
 637  class Conts, ///< The collection on which to apply the function, of size n.
 638  template<class, class> class Alg ///< The algorithm to apply.
 639  >
 642  TPB,
 643  alg_wrapper1<
 644  typename TPB::pool_traits_type,
 645  Alg<Conts, Fn>,
 647  >
 648  > {
 649  public:
 650  typedef subdivide_n_gen_wk<
 652  TPB,
 653  alg_wrapper1<
 654  typename TPB::pool_traits_type,
 655  Alg<Conts, Fn>,
 657  >
 660  typedef typename base_t::in_iterator in_iterator;
 662  typedef typename base_t::result_type result_type;
 663  typedef typename base_t::alg_wrap_t alg_wrap_t;
 666  typedef typename base_t::os_traits os_traits;
 671  using base_t::memory_access_mode;
 672 
 673  private:
 /// Sub-range constructor used by the recursive splitting in process().
 674  __stdcall subdivide_n_gen_wk1(
 676  operation_type &f,
 677  typename alg_wrap_t::work_complete_t &w,
 678  in_iterator const &b,
 679  in_iterator const &e) noexcept(true) FORCE_INLINE;
 680 
 681  public:
 683 
 684  /// Recursively call subdivide_n_gen_wk1::process(), on disjoint left and right-subsets (assuming even numbers of processors in the clique) of the input collection, until the number of work items generated is 2^n just larger than the number of threads in the pool, which implements a form of GSS(k) scheduling.
 685  /**
 686  As the subsets are disjoint inter-subset operations are effectively CRCW operations, whereas intra-subset operations are strictly EREW. This subdivision is valid according to Proposition 1.1 in section 1.2 and Brent's Theorem [1].
 687 
 688  [1] Casanova, H., Legrand, A., Robert, Y., "Parallel Algorithms", CRC Press, 2008.
 689 
 690  \see nonjoinable_buff
 691  */
 692  void __fastcall
 693  process() noexcept(false);
 694 
 695  constexpr bool __fastcall operator<(subdivide_n_gen_wk1 const &) const noexcept(true) FORCE_INLINE {
 696  return true;
 697  }
 698  };
699 
700  /**
701  This recursive process ensures that it takes O(log(n)) time to submit the work to the pool.
702 
703  Algorithm derived from section 3.3 & Theorem 3.3 in [1].
704 
705  Note that if the wrapped collection implements CREW or EREW semantics, as safe_colln does, then this algorithm is an implementation CREW/EREW P-RAM model, therefore if the thread_pool is large enough, it implements an optimal schedule according to [1].
706 
707  [1] Alan Gibbons, Wojciech Rytter, "Efficient Parallel Algorithms", Cambridge University Press, 1989.
708 
709  \see safe_colln
710  */
/// Two-range (transform-style: one input, one output) subdivision work item,
/// built on subdivide_n_gen_wk with alg_wrapper2 as the completion wrapper.
/// Tracks the output range via out_begin/out_end members.
/// NOTE(review): extract gaps - the class-declaration line (orig. l.718) and
/// several typedefs (orig. l.736-747, e.g. out_iterator, operation_type) are
/// missing from this listing.
 711  template<
 713  class TPB, ///< The thread_pool type.
 714  class UniOp, ///< The unary operation to be applied to each element in the input collection in some unspecified order.
 715  class Conts, ///< The collections on which to apply the function, of size n.
 716  template<class, class> class Alg ///< The algorithm to apply.
 717  >
 719  Ps,
 720  TPB,
 721  alg_wrapper2<
 722  typename TPB::pool_traits_type,
 723  Alg<Conts, UniOp>,
 724  TPB::pool_traits_type::result_traits_
 725  >
 726  > {
 727  public:
 728  typedef subdivide_n_gen_wk<
 729  Ps,
 730  TPB,
 731  alg_wrapper2<
 732  typename TPB::pool_traits_type,
 733  Alg<Conts, UniOp>,
 734  TPB::pool_traits_type::result_traits_
 735  >
 738  typedef typename base_t::in_iterator in_iterator;
 740  typedef typename base_t::result_type result_type;
 741  typedef typename base_t::alg_wrap_t alg_wrap_t;
 745  typedef typename base_t::os_traits os_traits;
 748  using base_t::compute_threads_per_clique;
 749  using base_t::compute_buffer_items;
 750  using base_t::memory_access_mode;
 751 
 752  private:
 /// Sub-range constructor used by the recursive splitting in process().
 753  __stdcall subdivide_n_gen_wk2(
 754  thread_pool_type &p,
 755  operation_type &o,
 756  typename alg_wrap_t::work_complete_t &w,
 757  algo_work_heap_type const &wh,
 758  in_iterator const &ib,
 759  in_iterator const &ie,
 760  out_iterator const &ob,
 761  out_iterator const &oe,
 762  const typename thread_pool_type::pool_type::size_type threads_per_clique) noexcept(true) FORCE_INLINE;
 763 
 764  public:
 765  __stdcall subdivide_n_gen_wk2(thread_pool_type &p, operation_type &o, typename alg_wrap_t::work_complete_t &w, algo_work_heap_type const &wh, typename std::iterator_traits<in_iterator>::difference_type const number_subranges, typename thread_pool_type::pool_type::size_type const cliques) noexcept(true) FORCE_INLINE;
 766 
 767  /// Recursively call subdivide_n_gen_wk2::process(), on disjoint left and right-subsets (assuming even numbers of processors in the clique) of the input collection, until the number of work items generated is 2^n just larger than the number of threads in the pool, which implements a form of GSS(k) scheduling.
 768  /**
 769  As the subsets are disjoint inter-subset operations are effectively CRCW operations, whereas intra-subset operations are strictly EREW. This subdivision is valid according to Proposition 1.1 in section 1.2 and Brent's Theorem [1].
 770 
 771  [1] Casanova, H., Legrand, A., Robert, Y., "Parallel Algorithms", CRC Press, 2008.
 772 
 773  \see nonjoinable_buff
 774  */
 775  void __fastcall
 776  process() noexcept(false);
 777 
 778  constexpr bool __fastcall operator<(subdivide_n_gen_wk2 const &) const noexcept(true) FORCE_INLINE {
 779  return true;
 780  }
 781 
 782  private:
 783  out_iterator out_begin;
 784  out_iterator const out_end;
 785  };
/// Specialization of subdivide_n_gen_wk2 (specialized argument is on the
/// missing class-declaration lines, orig. l.792-793, 798); constructors take
/// no work-heap parameter.
/// NOTE(review): extract gaps - typedefs at orig. l.803, 808, 810-811, 813,
/// 816-823, the public constructor (orig. l.837), the first ctor parameter
/// (orig. l.828) and the out_begin member (orig. l.855) are missing.
 786  template<
 787  class TPB, ///< The thread_pool type.
 788  class UniOp, ///< The unary operation to be applied to each element in the input collection in some unspecified order.
 789  class Conts, ///< The collections on which to apply the function, of size n.
 790  template<class, class> class Alg ///< The algorithm to apply.
 791  >
 794  TPB,
 795  alg_wrapper2<
 796  typename TPB::pool_traits_type,
 797  Alg<Conts, UniOp>,
 799  >
 800  > {
 801  public:
 802  typedef subdivide_n_gen_wk<
 804  TPB,
 805  alg_wrapper2<
 806  typename TPB::pool_traits_type,
 807  Alg<Conts, UniOp>,
 809  >
 812  typedef typename base_t::in_iterator in_iterator;
 814  typedef typename base_t::result_type result_type;
 815  typedef typename base_t::alg_wrap_t alg_wrap_t;
 819  typedef typename base_t::os_traits os_traits;
 824  using base_t::memory_access_mode;
 825 
 826  private:
 /// Sub-range constructor used by the recursive splitting in process().
 827  __stdcall subdivide_n_gen_wk2(
 829  operation_type &o,
 830  typename alg_wrap_t::work_complete_t &w,
 831  in_iterator const &ib,
 832  in_iterator const &ie,
 833  out_iterator const &ob,
 834  out_iterator const &oe) noexcept(true) FORCE_INLINE;
 835 
 836  public:
 838 
 839  /// Recursively call subdivide_n_gen_wk2::process(), on disjoint left and right-subsets (assuming even numbers of processors in the clique) of the input collection, until the number of work items generated is 2^n just larger than the number of threads in the pool, which implements a form of GSS(k) scheduling.
 840  /**
 841  As the subsets are disjoint inter-subset operations are effectively CRCW operations, whereas intra-subset operations are strictly EREW. This subdivision is valid according to Proposition 1.1 in section 1.2 and Brent's Theorem [1].
 842 
 843  [1] Casanova, H., Legrand, A., Robert, Y., "Parallel Algorithms", CRC Press, 2008.
 844 
 845  \see nonjoinable_buff
 846  */
 847  void __fastcall
 848  process() noexcept(false);
 849 
 850  constexpr bool __fastcall operator<(subdivide_n_gen_wk2 const &) const noexcept(true) FORCE_INLINE {
 851  return true;
 852  }
 853 
 854  private:
 856  out_iterator const out_end;
 857  };
858 
859  /**
860  This recursive process ensures that it takes O(log(n)) time to submit the work to the pool.
861 
862  Algorithm derived from [1].
863 
864  Note that if the wrapped collection implements CREW or EREW semantics, as safe_colln does, then this algorithm is an implementation CREW/EREW P-RAM model, therefore if the thread_pool is large enough, it implements an optimal schedule according to section 3.3 & Theorem 3.3 in [1].
865 
866  [1] Alan Gibbons, Wojciech Rytter, "Efficient Parallel Algorithms", Cambridge University Press, 1989.
867 
868  \see safe_colln
869  */
/// Three-range (binary transform-style: two inputs, one output) subdivision
/// work item, built on subdivide_n_gen_wk with alg_wrapper3 as the completion
/// wrapper. Tracks the second input and the output ranges as members.
/// NOTE(review): extract gaps - the class-declaration line (orig. l.877) and
/// several typedefs (orig. l.895-907, e.g. in2_iterator, out_iterator,
/// operation_type) are missing from this listing.
 870  template<
 872  class TPB, ///< The thread_pool type.
 873  class BinOp, ///< The binary operation to be applied to each element in the input collection in some unspecified order.
 874  class Conts, ///< The collections on which to apply the function, of size n.
 875  template<class, class> class Alg ///< The algorithm to apply.
 876  >
 878  Ps,
 879  TPB,
 880  alg_wrapper3<
 881  typename TPB::pool_traits_type,
 882  Alg<Conts, BinOp>,
 883  TPB::pool_traits_type::result_traits_
 884  >
 885  > {
 886  public:
 887  typedef subdivide_n_gen_wk<
 888  Ps,
 889  TPB,
 890  alg_wrapper3<
 891  typename TPB::pool_traits_type,
 892  Alg<Conts, BinOp>,
 893  TPB::pool_traits_type::result_traits_
 894  >
 897  typedef typename base_t::in_iterator in_iterator;
 899  typedef typename base_t::result_type result_type;
 900  typedef typename base_t::alg_wrap_t alg_wrap_t;
 905  typedef typename base_t::os_traits os_traits;
 908  using base_t::compute_threads_per_clique;
 909  using base_t::compute_buffer_items;
 910  using base_t::memory_access_mode;
 911 
 912  private:
 /// Sub-range constructor used by the recursive splitting in process().
 913  __stdcall subdivide_n_gen_wk3(
 914  thread_pool_type &p,
 915  operation_type &o,
 916  typename alg_wrap_t::work_complete_t &w,
 917  algo_work_heap_type const &wh,
 918  in_iterator const &ib1,
 919  in_iterator const &ie1,
 920  in2_iterator const &ib2,
 921  in2_iterator const &ie2,
 922  out_iterator const &ob,
 923  out_iterator const &oe,
 924  typename thread_pool_type::pool_type::size_type const threads_per_clique) noexcept(true) FORCE_INLINE;
 925 
 926  public:
 927  __stdcall subdivide_n_gen_wk3(thread_pool_type &p, operation_type &o, typename alg_wrap_t::work_complete_t &w, algo_work_heap_type const &wh, typename std::iterator_traits<in_iterator>::difference_type const number_subranges, typename thread_pool_type::pool_type::size_type const cliques) noexcept(true) FORCE_INLINE;
 928 
 929  /// Recursively call subdivide_n_gen_wk3::process(), on disjoint left and right-subsets (assuming even numbers of processors in the clique) of the input collection, until the number of work items generated is 2^n just larger than the number of threads in the pool, which implements a form of GSS(k) scheduling.
 930  /**
 931  As the subsets are disjoint inter-subset operations are effectively CRCW operations, whereas intra-subset operations are strictly EREW. This subdivision is valid according to Proposition 1.1 in section 1.2 and Brent's Theorem [1].
 932 
 933  [1] Casanova, H., Legrand, A., Robert, Y., "Parallel Algorithms", CRC Press, 2008.
 934 
 935  \see nonjoinable_buff
 936  */
 937  void __fastcall
 938  process() noexcept(false);
 939 
 940  constexpr bool __fastcall operator<(subdivide_n_gen_wk3 const &) const noexcept(true) FORCE_INLINE {
 941  return true;
 942  }
 943 
 944  private:
 945  in2_iterator in_begin2;
 946  in2_iterator const in_end2;
 947  out_iterator out_begin;
 948  out_iterator const out_end;
 949  };
/// Specialization of subdivide_n_gen_wk3 (specialized argument is on the
/// missing class-declaration lines, orig. l.956-957, 962); constructors take
/// no work-heap parameter.
/// NOTE(review): extract gaps - typedefs at orig. l.967, 972, 974-975, 977,
/// 980-988, the public constructor (orig. l.1004), the first ctor parameter
/// (orig. l.993), and the mutable in_begin2/out_begin members (orig. l.1022,
/// 1024) are missing from this listing.
 950  template<
 951  class TPB, ///< The thread_pool type.
 952  class BinOp, ///< The binary operation to be applied to each element in the input collection in some unspecified order.
 953  class Conts, ///< The collections on which to apply the function, of size n.
 954  template<class, class> class Alg ///< The algorithm to apply.
 955  >
 958  TPB,
 959  alg_wrapper3<
 960  typename TPB::pool_traits_type,
 961  Alg<Conts, BinOp>,
 963  >
 964  > {
 965  public:
 966  typedef subdivide_n_gen_wk<
 968  TPB,
 969  alg_wrapper3<
 970  typename TPB::pool_traits_type,
 971  Alg<Conts, BinOp>,
 973  >
 976  typedef typename base_t::in_iterator in_iterator;
 978  typedef typename base_t::result_type result_type;
 979  typedef typename base_t::alg_wrap_t alg_wrap_t;
 984  typedef typename base_t::os_traits os_traits;
 989  using base_t::memory_access_mode;
 990 
 991  private:
 /// Sub-range constructor used by the recursive splitting in process().
 992  __stdcall subdivide_n_gen_wk3(
 994  operation_type &o,
 995  typename alg_wrap_t::work_complete_t &w,
 996  in_iterator const &ib1,
 997  in_iterator const &ie1,
 998  in2_iterator const &ib2,
 999  in2_iterator const &ie2,
 1000  out_iterator const &ob,
 1001  out_iterator const &oe) noexcept(true) FORCE_INLINE;
 1002 
 1003  public:
 1005 
 1006  /// Recursively call subdivide_n_gen_wk3::process(), on disjoint left and right-subsets (assuming even numbers of processors in the clique) of the input collection, until the number of work items generated is 2^n just larger than the number of threads in the pool, which implements a form of GSS(k) scheduling.
 1007  /**
 1008  As the subsets are disjoint inter-subset operations are effectively CRCW operations, whereas intra-subset operations are strictly EREW. This subdivision is valid according to Proposition 1.1 in section 1.2 and Brent's Theorem [1].
 1009 
 1010  [1] Casanova, H., Legrand, A., Robert, Y., "Parallel Algorithms", CRC Press, 2008.
 1011 
 1012  \see nonjoinable_buff
 1013  */
 1014  void __fastcall
 1015  process() noexcept(false);
 1016 
 1017  constexpr bool __fastcall operator<(subdivide_n_gen_wk3 const &) const noexcept(true) FORCE_INLINE {
 1018  return true;
 1019  }
 1020 
 1021  private:
 1023  in2_iterator const in_end2;
 1025  out_iterator const out_end;
 1026  };
1027 
1028 } } } }
1029 
1031 
1032 #endif