libjmmcg  release_579_6_g8cffd
A C++ library containing an eclectic mix of useful, advanced components.
memops_impl.hpp
Go to the documentation of this file.
1 /******************************************************************************
2 ** Copyright © 2013 by J.M.McGuiness, coder@hussar.me.uk
3 **
4 ** This library is free software; you can redistribute it and/or
5 ** modify it under the terms of the GNU Lesser General Public
6 ** License as published by the Free Software Foundation; either
7 ** version 2.1 of the License, or (at your option) any later version.
8 **
9 ** This library is distributed in the hope that it will be useful,
10 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
11 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 ** Lesser General Public License for more details.
13 **
14 ** You should have received a copy of the GNU Lesser General Public
15 ** License along with this library; if not, write to the Free Software
16 ** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18 
19 namespace jmmcg { namespace LIBJMMCG_VER_NAMESPACE {
20 
21 namespace private_ {
22 
template<
	unsigned long long N ///< The number of times the operation should be applied.
>
struct unroll {
	/// Apply the specified operation the specified number of times, sequentially.
	/**
		Invokes o(N-1), o(N-2), ..., o(0), i.e. in descending index order, via
		tail recursion that the optimiser is expected to flatten into
		straight-line code.

		\param o The operation to apply.
	*/
	template<class Op>
	static constexpr void FORCE_INLINE
	result(Op &&o) noexcept(noexcept(o.operator()(N-1))) {
		o.operator()(N-1);
		unroll<N-1>::result(std::forward<Op>(o));
	}
	/// Apply the operation up to N times, stopping at the first failure.
	/**
		Short-circuits: the recursion stops at the first invocation that
		returns false.
		NOTE(review): the noexcept specification only covers a single
		o(N-1) call, not the recursive invocations - confirm intentional.

		\param o The operation to apply; must return a bool.
		\return true if every invocation returned true, otherwise false.
	*/
	template<class Op>
	static constexpr bool FORCE_INLINE
	result_b(Op &&o) noexcept(noexcept(o.operator()(N-1))) {
		const bool res=o.operator()(N-1);
		return res ? unroll<N-1>::result_b(std::forward<Op>(o)) : false;
	}
};
/// Recursion terminator: applying an operation zero times is a no-op.
template<>
struct unroll<0ull> {
	template<class Op>
	static constexpr void FORCE_INLINE
	result(Op const &) noexcept(true) {
	}
	/// Zero applications vacuously succeed.
	template<class Op>
	static constexpr bool FORCE_INLINE
	result_b(Op const &) noexcept(true) {
		return true;
	}
};
56 
57  namespace bits {
58 
/// Clear one set bit of the argument.
/**
	NOTE: despite the name, this clears the LOWEST (least-significant) set
	bit - the classic value & (value-1) trick - which is exactly what the
	mask-scanning callers in this file rely upon.

	\param value A non-zero value.
	\return value with its lowest set bit cleared.
*/
template<class T> inline constexpr
T clear_leftmost_set(const T value) noexcept(true) {
	assert(value != 0);
	const T lowest_bit_cleared=static_cast<T>(value & (value - 1));
	return lowest_bit_cleared;
}
64 
/// Count the consecutive zero bits starting at the least-significant bit.
/**
	Only the explicitly-specialised widths are supported; all other types
	are rejected at compile-time.
	\param value A non-zero value (the builtins are undefined for zero).
*/
template<class T> inline constexpr
unsigned count_trailing_zeroes(const T value) noexcept(true)=delete;
template<> inline
unsigned count_trailing_zeroes<std::uint32_t>(const std::uint32_t value) noexcept(true) {
	assert(value != 0);
	return __builtin_ctz(value);
}
template<> inline
unsigned count_trailing_zeroes<std::uint64_t>(const std::uint64_t value) noexcept(true) {
	assert(value!=0);
	return __builtin_ctzl(value);
}
// NOTE(review): relies on a 128-bit compatibility helper declared elsewhere
// in this library.
template<> inline
unsigned count_trailing_zeroes<uint128_t>(const uint128_t value) noexcept(true) {
	assert(value!=0);
	return count_trailing_zeroes_compat(value);
}
82 
/// Count the consecutive zero bits starting at the most-significant bit.
/**
	Only the explicitly-specialised widths are supported; all other types
	are rejected at compile-time.
	\param value A non-zero value (the builtins are undefined for zero).
*/
template<class T> inline constexpr
unsigned count_leading_zeroes(const T value) noexcept(true)=delete;
template<> inline
unsigned count_leading_zeroes<std::uint32_t>(const std::uint32_t value) noexcept(true) {
	assert(value!=0);
	return __builtin_clz(value);
}
template<> inline
unsigned count_leading_zeroes<std::uint64_t>(const std::uint64_t value) noexcept(true) {
	assert(value!=0);
	return __builtin_clzl(value);
}
// NOTE(review): relies on a 128-bit compatibility helper declared elsewhere
// in this library.
template<> inline
unsigned count_leading_zeroes<uint128_t>(const uint128_t value) noexcept(true) {
	assert(value!=0);
	return count_leading_zeroes_compat(value);
}
100 
101  }
102 
103  template<
104  std::size_t SrcSz,
105  std::size_t DestSz,
106  class Unit,
107  std::size_t SmallestBuff=min<std::size_t, SrcSz, DestSz>::value,
108  std::size_t Div=SmallestBuff/sizeof(Unit),
109  std::size_t Rem=SmallestBuff%sizeof(Unit)
110  >
112  using element_type=Unit;
113  enum : std::size_t {
114  src_sz=SrcSz,
115  dest_sz=DestSz,
116  smallest_buff=SmallestBuff,
117  div=Div, ///< How many whole Units can be copied.
118  rem=Rem,
119  end=smallest_buff-rem ///< How much of the buffer was copied.
120  };
121  using unrolled_op_t=private_::unroll<div>;
122 
123  static_assert(src_sz>0 && dest_sz>0, "Buffers must be non-zero.");
124  BOOST_MPL_ASSERT_RELATION(smallest_buff, >, 0);
125  BOOST_MPL_ASSERT_RELATION(sizeof(Unit), >, 0);
126  BOOST_MPL_ASSERT_RELATION(div, >=, 0);
127  BOOST_MPL_ASSERT_RELATION(rem, >=, 0);
128  BOOST_MPL_ASSERT_RELATION(end, >=, 0);
129  static_assert(smallest_buff<=src_sz && smallest_buff<=dest_sz, "TODO.");
130  BOOST_MPL_ASSERT_RELATION(rem, <=, sizeof(Unit));
131  BOOST_MPL_ASSERT_RELATION((sizeof(Unit)*div), <=, smallest_buff);
132  BOOST_MPL_ASSERT_RELATION(rem, <=, smallest_buff);
133  BOOST_MPL_ASSERT_RELATION(end, <=, smallest_buff);
134  static_assert((div==0 && rem<=sizeof(Unit)) || (div>0 && end<=smallest_buff), "TODO.");
135 
136  template<class Op>
137  static constexpr void FORCE_INLINE
138  result(Op &&o) noexcept(noexcept(unrolled_op_t::result(std::forward<Op>(o)))) {
139  // Unroll the copy in the hope that the compiler will notice the sequence of copies and optimize it.
140  unrolled_op_t::result(std::forward<Op>(o));
141  }
142  template<class Op>
143  static constexpr bool FORCE_INLINE
144  result_b(Op &&o) noexcept(noexcept(unrolled_op_t::result(std::forward<Op>(o)))) {
145  // Unroll the copy in the hope that the compiler will notice the sequence of copies and optimize it.
146  return unrolled_op_t::result_b(std::forward<Op>(o));
147  }
148  };
/**
	The buffer is not in units of Unit, so no-op.
*/
template<
	std::size_t SrcSz,
	std::size_t DestSz,
	class Unit,
	std::size_t SmallestBuff,
	std::size_t Rem
>
struct aligned_unroller<SrcSz, DestSz, Unit, SmallestBuff, 0, Rem> {
	using element_type=Unit;
	enum : std::size_t {
		src_sz=SrcSz,
		dest_sz=DestSz,
		smallest_buff=SmallestBuff,
		div=std::size_t(), ///< No whole Unit fits into the buffer.
		rem=Rem,
		end=std::size_t() ///< Nothing processed, so nothing was copied.
	};

	/// No whole Unit fits, so there is nothing to do.
	template<class Op>
	static constexpr void FORCE_INLINE
	result(Op const &) noexcept(true) {
	}
	/// Vacuously true: there were no Units to process.
	template<class Op>
	static constexpr bool FORCE_INLINE
	result_b(Op const &) noexcept(true) {
		return true;
	}
};
180 
/// Tag types used by select_size to dispatch to the widest usable SIMD
/// implementation. (The greater_than_eq_* definitions were lost in
/// extraction but are referenced throughout this file; they are empty
/// dispatch tags.)
struct greater_than_eq_512 {};
struct greater_than_eq_256 {};
struct greater_than_eq_128 {};
struct char_sized {};
185 
186  template<std::size_t FirstSz>
187  class select_size {
188  private:
189  enum : std::size_t {
190  avx512_sz=
191 #ifdef __AVX512__
192  sizeof(__m512i),
193 #elif defined(__AVX__)
194  sizeof(__m256i),
195 #elif defined(__SSE2__)
196  sizeof(__m128i),
197 #else
198  1,
199 #endif
200  avx_sz=
201 #ifdef __AVX__
202  sizeof(__m256i),
203 #elif defined(__SSE2__)
204  sizeof(__m128i),
205 #else
206  1,
207 #endif
208  sse2_sz=
209 #ifdef __SSE2__
210  sizeof(__m128i),
211 #else
212  1,
213 #endif
214  };
215  using sz512=
216 #ifdef __AVX512__
217  greater_than_eq_512;
218 #elif defined(__AVX__)
219  greater_than_eq_256;
220 #elif defined(__SSE2__)
222 #else
223  char_sized;
224 #endif
225  using sz256=
226 #ifdef __AVX__
227  greater_than_eq_256;
228 #elif defined(__SSE2__)
230 #else
231  char_sized;
232 #endif
233  using sz128=
234 #ifdef __SSE2__
236 #else
237  char_sized;
238 #endif
239 
240  public:
241  using type=typename std::conditional<
242  FirstSz>=avx512_sz,
243  sz512,
244  typename std::conditional<
245  FirstSz>=avx_sz,
246  sz256,
247  typename std::conditional<
248  FirstSz>=sse2_sz,
249  sz128,
250  char_sized
251  >::type
252  >::type
253  >::type;
254  };
255 
/// SIMD-accelerated strchr: primary declaration, specialised per register class.
/**
	\tparam needle The character to search for.
	\tparam FirstSz The size of the haystack buffer, in bytes.
	\tparam Sz A tag (from select_size) choosing the implementation.
*/
template<
	char const needle,
	std::size_t FirstSz,
	class Sz
>
struct strchr_opt;
262 #ifdef __AVX512__
263  template<
264  char const needle,
266  >
268  static constexpr char const * FORCE_INLINE __attribute__((pure))
269  result(char const * const haystack) noexcept(true) {
270  using element_type=__m512i;
271  const auto strchr_opt_int=[&haystack]() __attribute__((pure)) -> char const * {
273  const element_type block_first=_mm512_loadu_si512(reinterpret_cast<element_type const *>(haystack));
276  if (mask!=0) {
278  return haystack+bitpos;
279  } else {
280  return nullptr;
281  }
282  };
283 
284  const auto ret=strchr_opt_int();
285  if (ret) {
286  return ret;
287  } else {
288  constexpr const std::size_t next_portion=((FirstSz>=sizeof(element_type)) ? (FirstSz-sizeof(element_type)) : FirstSz);
289  if (next_portion) {
290  return strchr_opt<
291  needle,
292  next_portion,
293  typename select_size<next_portion>::type
294  >::result(haystack+sizeof(element_type));
295  } else {
296  return nullptr;
297  }
298  }
299  }
300  };
301 #endif
302 #ifdef __AVX__
303  template<
304  char const needle,
306  >
308  static constexpr char const * FORCE_INLINE __attribute__((pure))
309  result(char const * const haystack) noexcept(true) {
310  using element_type=__m256i;
311  const auto strchr_opt_int=[&haystack]() -> char const * __attribute__((no_sanitize("address"))) __attribute__((pure)) {
313  const element_type block_first=_mm256_loadu_si256(reinterpret_cast<element_type const *>(haystack));
316  if (mask!=0) {
318  return haystack+bitpos;
319  } else {
320  return nullptr;
321  }
322  };
323 
324  const auto ret=strchr_opt_int();
325  if (ret) {
326  return ret;
327  } else {
328  constexpr const std::size_t next_portion=((FirstSz>=sizeof(element_type)) ? (FirstSz-sizeof(element_type)) : FirstSz);
329  if (next_portion) {
330  return strchr_opt<
331  needle,
332  next_portion,
333  typename select_size<next_portion>::type
334  >::result(haystack+sizeof(element_type));
335  } else {
336  return nullptr;
337  }
338  }
339  }
340  };
341 #endif
342 #ifdef __SSE2__
template<
	char const needle,
	std::size_t FirstSz
>
struct strchr_opt<needle, FirstSz, greater_than_eq_128> {
	/// Scan 16 bytes at a time for needle, recursing over the rest of the buffer.
	/**
		NOTE(review): the 16-byte load may read past the logical end of the
		data (address-sanitizer is deliberately suppressed on the lambda),
		and a match located after an embedded nul would differ from
		std::strchr's C-string semantics - confirm the intended
		buffer-search contract.

		\param haystack The buffer to search.
		\return Pointer to the first occurrence of needle, or nullptr.
	*/
	static constexpr char const * FORCE_INLINE __attribute__((pure))
	result(char const * const haystack) noexcept(true) {
		using element_type=__m128i;
		const auto strchr_opt_int=[&haystack]() -> char const * __attribute__((no_sanitize("address"))) __attribute__((pure)) {
			const element_type conv=_mm_set1_epi8(needle);
			const element_type block_first=_mm_loadu_si128(reinterpret_cast<element_type const *>(haystack));
			const element_type eq_needle=_mm_cmpeq_epi8(conv, block_first);
			const std::uint32_t mask=_mm_movemask_epi8(eq_needle);
			if (mask!=0) {
				// Lowest set bit <=> first matching position in this block.
				const auto bitpos=private_::bits::count_trailing_zeroes(mask);
				return haystack+bitpos;
			} else {
				return nullptr;
			}
		};

		const auto ret=strchr_opt_int();
		if (ret) {
			return ret;
		} else {
			// Not in this block: recurse over the remainder of the buffer.
			constexpr const std::size_t next_portion=((FirstSz>=sizeof(element_type)) ? (FirstSz-sizeof(element_type)) : FirstSz);
			if (next_portion) {
				return strchr_opt<
					needle,
					next_portion,
					typename select_size<next_portion>::type
				>::result(haystack+sizeof(element_type));
			} else {
				return nullptr;
			}
		}
	}
};
381 #endif
382  template<
383  char const needle,
384  std::size_t FirstSz
385  >
386  struct strchr_opt<needle, FirstSz, char_sized> {
387  static constexpr char const * FORCE_INLINE
388  result(char const * const haystack) noexcept(true) {
389  return std::strchr(haystack, needle);
390  }
391  };
392 
/// SIMD-accelerated strstr: primary declaration, specialised per register class.
/**
	\tparam FirstSz The size of the haystack buffer, in bytes.
	\tparam SecondSz The size of the needle array, in bytes (including the
		nul terminator, when initialised from a string literal).
	\tparam Sz A tag (from select_size) choosing the implementation.
*/
template<
	std::size_t FirstSz,
	std::size_t SecondSz,
	class Sz
>
struct strstr_opt;
399 #ifdef __AVX512__
400  template<
401  std::size_t FirstSz,
403  >
405  static constexpr char const * FORCE_INLINE __attribute__((pure))
406  result(char const * const haystack, char const (&needle)[SecondSz]) noexcept(true) {
407  using element_type=__m512i;
408  const auto strstr_opt_int=[&haystack, &needle]() __attribute__((pure)) -> char const * {
411  const element_type block_first=_mm512_loadu_si512(reinterpret_cast<element_type const *>(haystack));
412  const element_type block_last=_mm512_loadu_si512(reinterpret_cast<element_type const *>(haystack+SecondSz-1));
416  while (mask!=0) {
417  if constexpr ((SecondSz-2)>0) {
419  if ((bitpos+1+SecondSz-2)<=FirstSz) {
420  using needle_cmp_t=char const [SecondSz-2];
421  if (memcmp_opt(reinterpret_cast<needle_cmp_t &>(haystack[bitpos+1]), reinterpret_cast<needle_cmp_t &>(needle[1]))) {
422  return haystack+bitpos;
423  }
425  } else {
426  return nullptr;
427  }
428  } else {
429  return haystack;
430  }
431  }
432  return nullptr;
433  };
434 
435  const auto ret=strstr_opt_int();
436  if (ret) {
437  return ret;
438  } else {
439  constexpr const std::size_t next_portion=((FirstSz>=sizeof(element_type)) ? (FirstSz-sizeof(element_type)) : FirstSz);
440  if (next_portion) {
441  return strstr_opt<
442  next_portion,
443  SecondSz,
444  typename select_size<next_portion>::type
445  >::result(haystack+sizeof(element_type), needle);
446  } else {
447  return nullptr;
448  }
449  }
450  return nullptr;
451  }
452  };
453 #endif
454 #ifdef __AVX__
455  template<
456  std::size_t FirstSz,
458  >
460  static constexpr char const * FORCE_INLINE __attribute__((pure))
461  result(char const * const haystack, char const (&needle)[SecondSz]) noexcept(true) {
462  using element_type=__m256i;
463  const auto strstr_opt_int=[&haystack, &needle]() -> char const * __attribute__((no_sanitize("address"))) __attribute__((pure)) {
466  const element_type block_first=_mm256_loadu_si256(reinterpret_cast<element_type const *>(haystack));
467  const element_type block_last=_mm256_loadu_si256(reinterpret_cast<element_type const *>(haystack+SecondSz-1));
471  while (mask!=0) {
472  if constexpr ((SecondSz-2)>0) {
474  if ((bitpos+1+SecondSz-2)<=FirstSz) {
475  using needle_cmp_t=char const [SecondSz-2];
476  if (memcmp_opt(reinterpret_cast<needle_cmp_t &>(haystack[bitpos+1]), reinterpret_cast<needle_cmp_t &>(needle[1]))) {
477  return haystack+bitpos;
478  }
480  } else {
481  return nullptr;
482  }
483  } else {
484  return haystack;
485  }
486  }
487  return nullptr;
488  };
489 
490  const auto ret=strstr_opt_int();
491  if (ret) {
492  return ret;
493  } else {
494  constexpr const std::size_t next_portion=((FirstSz>=sizeof(element_type)) ? (FirstSz-sizeof(element_type)) : FirstSz);
495  if (next_portion) {
496  return strstr_opt<
497  next_portion,
498  SecondSz,
499  typename select_size<next_portion>::type
500  >::result(haystack+sizeof(element_type), needle);
501  } else {
502  return nullptr;
503  }
504  }
505  return nullptr;
506  }
507  };
508 #endif
509 #ifdef __SSE2__
510  template<
511  std::size_t FirstSz,
512  std::size_t SecondSz
513  >
514  struct strstr_opt<FirstSz, SecondSz, greater_than_eq_128> {
515  static constexpr char const * FORCE_INLINE __attribute__((pure))
516  result(char const * const haystack, char const (&needle)[SecondSz]) noexcept(true) {
517  using element_type=__m128i;
518  const auto strstr_opt_int=[&haystack, &needle]() -> char const * __attribute__((no_sanitize("address"))) __attribute__((pure)) {
519  const element_type first=_mm_set1_epi8(needle[0]);
520  const element_type last=_mm_set1_epi8(needle[SecondSz-1]);
521  const element_type block_first=_mm_loadu_si128(reinterpret_cast<element_type const *>(haystack));
522  const element_type block_last=_mm_loadu_si128(reinterpret_cast<element_type const *>(haystack+SecondSz-1));
523  const element_type eq_first=_mm_cmpeq_epi8(first, block_first);
524  const element_type eq_last=_mm_cmpeq_epi8(last, block_last);
525  std::uint32_t mask=_mm_movemask_epi8(_mm_or_si128(eq_first, eq_last));
526  while (mask!=0) {
527  if constexpr ((SecondSz-2)>0) {
528  const auto bitpos=private_::bits::count_trailing_zeroes(mask);
529  if ((bitpos+1+SecondSz-2)<=FirstSz) {
530  using needle_cmp_t=char const [SecondSz-2];
531  if (memcmp_opt(reinterpret_cast<needle_cmp_t &>(haystack[bitpos+1]), reinterpret_cast<needle_cmp_t &>(needle[1]))) {
532  return haystack+bitpos;
533  }
534  mask=private_::bits::clear_leftmost_set(mask);
535  } else {
536  return nullptr;
537  }
538  } else {
539  return haystack;
540  }
541  }
542  return nullptr;
543  };
544 
545  const auto ret=strstr_opt_int();
546  if (ret) {
547  return ret;
548  } else {
549  constexpr const std::size_t next_portion=((FirstSz>=sizeof(element_type)) ? (FirstSz-sizeof(element_type)) : FirstSz);
550  if (next_portion) {
551  return strstr_opt<
552  next_portion,
553  SecondSz,
554  typename select_size<next_portion>::type
555  >::result(haystack+sizeof(element_type), needle);
556  } else {
557  return nullptr;
558  }
559  }
560  return nullptr;
561  }
562  };
563 #endif
564  template<
565  std::size_t FirstSz,
566  std::size_t SecondSz
567  >
568  struct strstr_opt<FirstSz, SecondSz, char_sized> {
569  static constexpr char const * FORCE_INLINE
570  result(char const * const haystack, char const (&needle)[SecondSz]) noexcept(true) {
571  return std::strstr(haystack, needle);
572  }
573  };
574 
575 }
576 
577 template<class Iter1, class Iter2> inline void FORCE_INLINE
578 memcpy(Iter1 dest, Iter2 src, std::size_t n) noexcept(true) {
579  std::uninitialized_copy(src, src+n, dest);
580 }
581 
/// Specialisation for raw char buffers: defer to the C library memcpy.
template<> inline void FORCE_INLINE
memcpy<char *, char const *>(char *dest, char const *src, std::size_t n) noexcept(true) {
	std::memcpy(dest, src, n);
}
586 
/// Specialisation for wide-character buffers; n is in wchar_t units.
template<> inline void FORCE_INLINE
memcpy<wchar_t *, wchar_t const *>(wchar_t *dest, wchar_t const *src, std::size_t n) noexcept(true) {
	std::wmemcpy(dest, src, n);
}
591 
/// Generic overload of memmove: copy n elements from src to dest.
/**
	NOTE(review): implemented as a plain forward copy - unlike std::memmove
	(and the char/wchar_t specialisations below) this does NOT handle
	overlapping ranges where dest lies inside [src, src+n); confirm callers
	never pass overlapping iterator ranges to this overload.
*/
template<class Iter1, class Iter2> inline void FORCE_INLINE
memmove(Iter1 dest, Iter2 src, std::size_t n) noexcept(true) {
	std::uninitialized_copy(src, src+n, dest);
}
596 
/// Specialisation for raw char buffers: std::memmove handles overlap correctly.
template<> inline void FORCE_INLINE
memmove<char *, char const *>(char *dest, char const *src, std::size_t n) noexcept(true) {
	std::memmove(dest, src, n);
}
601 
/// Specialisation for wide-character buffers; n is in wchar_t units and overlap is handled.
template<> inline void FORCE_INLINE
memmove<wchar_t *, wchar_t const *>(wchar_t *dest, wchar_t const *src, std::size_t n) noexcept(true) {
	std::wmemmove(dest, src, n);
}
606 
607 template<class Iter, class V> inline typename std::enable_if<std::is_same<typename std::iterator_traits<Iter>::value_type, V>::value>::type FORCE_INLINE
608 memset(Iter dest, V i, std::size_t n) noexcept(true) {
609  std::fill_n(dest, n, i);
610 }
611 
/// Specialisation for raw char buffers: defer to the C library memset.
template<> inline void FORCE_INLINE
memset<char *, char>(char *dest, char i, std::size_t n) noexcept(true) {
	std::memset(dest, i, n);
}
616 
/// Specialisation for wide-character buffers; n is in wchar_t units.
template<> inline void FORCE_INLINE
memset<wchar_t *, wchar_t>(wchar_t *dest, wchar_t i, std::size_t n) noexcept(true) {
	std::wmemset(dest, i, n);
}
621 
622 template<class Iter> inline bool FORCE_INLINE
623 memcmp(Iter src1, Iter src2, std::size_t n) noexcept(true) {
624  return std::equal(src1, src1+n, src2);
625 }
626 
/// Specialisation for raw char buffers: true when the n bytes are identical.
template<> inline bool FORCE_INLINE
memcmp<char const *>(char const *src1, char const *src2, std::size_t n) noexcept(true) {
	return std::memcmp(src1, src2, n)==0;
}
631 
/// Specialisation for wide-character buffers; n is in wchar_t units.
template<> inline bool FORCE_INLINE
memcmp<wchar_t const *>(wchar_t const *src1, wchar_t const *src2, std::size_t n) noexcept(true) {
	return std::wmemcmp(src1, src2, n)==0;
}
636 
/// Copy between C-style arrays, element-wise, up to the size of the smaller array.
/**
	NOTE(review): the argument order here is (src, dest) - the reverse of the
	iterator-based memcpy overloads above; also noexcept(false) unlike them.

	\param src The source array.
	\param dest The destination array.
*/
template<class Val, std::size_t SrcSz, std::size_t DestSz> void FORCE_INLINE
memcpy(Val const (& src)[SrcSz], Val (& dest)[DestSz]) noexcept(false) {
	enum : std::size_t {
		smallest_buff=min<std::size_t, SrcSz, DestSz>::value
	};
	using unrolled_op_t=private_::unroll<smallest_buff>;
	// Unroll the copy in the hope that the compiler will notice the sequence of copies and optimize it.
	unrolled_op_t::result([&src, &dest](std::size_t i) {dest[i]=src[i];});
}
646 
/// Compile-time-unrolled memcpy for char arrays.
/**
	Copies min(SrcSz, DestSz) bytes, attempting the widest available register
	class first (AVX-512, AVX, SSE, then 64/32/16/8-bit integer words), each
	pass consuming whatever the previous, wider, pass left over.

	\param src The source array.
	\param dest The destination array.
*/
template<
	std::size_t SrcSz,
	std::size_t DestSz
>
inline constexpr void FORCE_INLINE
memcpy_opt(char const (&src)[SrcSz], char (&dest)[DestSz]) noexcept(true) {
	static_assert(SrcSz>0 && DestSz>0, "Buffers must be non-zero.");
	using unrolled_512_op_t=private_::aligned_unroller<
		SrcSz,
		DestSz,
#ifdef __AVX512F__
		__m512
#else
		char[std::numeric_limits<short>::max()] // A type assuredly bigger than anything reasonable.
#endif
	>;
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wignored-attributes"
	using unrolled_256_op_t=private_::aligned_unroller<
		SrcSz-unrolled_512_op_t::end,
		DestSz-unrolled_512_op_t::end,
#ifdef __AVX__
		__m256
#else
		char[std::numeric_limits<short>::max()] // A type assuredly bigger than anything reasonable.
#endif
	>;
	using unrolled_128_op_t=private_::aligned_unroller<
		SrcSz-unrolled_256_op_t::end,
		DestSz-unrolled_256_op_t::end,
#ifdef __SSE__
		__m128
#else
		char[std::numeric_limits<short>::max()] // A type assuredly bigger than anything reasonable.
#endif
	>;
#pragma GCC diagnostic pop
	using unrolled_64_op_t=private_::aligned_unroller<
		SrcSz-unrolled_256_op_t::end-unrolled_128_op_t::end,
		DestSz-unrolled_256_op_t::end-unrolled_128_op_t::end,
		uint64_t
	>;
	using unrolled_32_op_t=private_::aligned_unroller<
		SrcSz-unrolled_256_op_t::end-unrolled_128_op_t::end-unrolled_64_op_t::end,
		DestSz-unrolled_256_op_t::end-unrolled_128_op_t::end-unrolled_64_op_t::end,
		uint32_t
	>;
	using unrolled_16_op_t=private_::aligned_unroller<
		SrcSz-unrolled_256_op_t::end-unrolled_128_op_t::end-unrolled_64_op_t::end-unrolled_32_op_t::end,
		DestSz-unrolled_256_op_t::end-unrolled_128_op_t::end-unrolled_64_op_t::end-unrolled_32_op_t::end,
		uint16_t
	>;
	using unrolled_8_op_t=private_::aligned_unroller<
		SrcSz-unrolled_256_op_t::end-unrolled_128_op_t::end-unrolled_64_op_t::end-unrolled_32_op_t::end-unrolled_16_op_t::end,
		DestSz-unrolled_256_op_t::end-unrolled_128_op_t::end-unrolled_64_op_t::end-unrolled_32_op_t::end-unrolled_16_op_t::end,
		uint8_t
	>;
	// After all of the larger word sizes have been use we MUST end up with only zero or one byte to copy...
	static_assert(
		(
			unrolled_512_op_t::smallest_buff-unrolled_512_op_t::end
			-unrolled_256_op_t::end
			-unrolled_128_op_t::end
			-unrolled_64_op_t::end
			-unrolled_32_op_t::end
			-unrolled_16_op_t::end
		)<=1,
		"Oh b*ll*x. The unrolling meta-program is seriously fsck'd, please file a bug report."
	);

	// Unroll the copy in the hope that the compiler will notice the sequence of copies and optimize it.
#ifdef __AVX512F__
	// NOTE(review): this pass offsets by unrolled_256_op_t::end (not 0) and,
	// like the other SIMD passes below, indexes a char array directly with i
	// (a Unit count, not a byte offset) - this looks inconsistent with the
	// pointer-cast indexing of the integer passes; confirm the intended
	// addressing.
	unrolled_512_op_t::result(
		[&src, &dest](std::size_t i) {
			const typename unrolled_512_op_t::element_type tmp=_mm512_loadu_ps(&((src+unrolled_256_op_t::end)[i]));
			_mm512_storeu_ps(&((dest+unrolled_256_op_t::end)[i]), tmp);
		}
	);
#endif
#ifdef __AVX__
	unrolled_256_op_t::result(
		[&src, &dest](std::size_t i) {
			const typename unrolled_256_op_t::element_type tmp=_mm256_loadu_ps(reinterpret_cast<float const *>(&((src+unrolled_256_op_t::end)[i])));
			_mm256_storeu_ps(reinterpret_cast<float *>(&((dest+unrolled_256_op_t::end)[i])), tmp);
		}
	);
#endif
#ifdef __SSE__
	// NOTE(review): offsets by unrolled_256_op_t::end, i.e. the same base as
	// the 256-bit pass rather than the end of what the 256-bit pass copied -
	// confirm.
	unrolled_128_op_t::result(
		[&src, &dest](std::size_t i) {
			const typename unrolled_128_op_t::element_type tmp=_mm_loadu_ps(reinterpret_cast<float const *>(&((src+unrolled_256_op_t::end)[i])));
			_mm_storeu_ps(reinterpret_cast<float *>(&((dest+unrolled_256_op_t::end)[i])), tmp);
		}
	);
#endif
	// The integer passes index through a cast pointer, so i here is correctly
	// scaled by the word size.
	unrolled_64_op_t::result(
		[&src, &dest](std::size_t i) __attribute__((no_sanitize("undefined"))) {
			reinterpret_cast<uint64_t*>(dest+unrolled_256_op_t::end+unrolled_128_op_t::end)[i]=reinterpret_cast<uint64_t const *>(src+unrolled_256_op_t::end+unrolled_128_op_t::end)[i];
		}
	);
	unrolled_32_op_t::result(
		[&src, &dest](std::size_t i) __attribute__((no_sanitize("undefined"))) {
			reinterpret_cast<uint32_t*>(dest+unrolled_256_op_t::end+unrolled_128_op_t::end+unrolled_64_op_t::end)[i]=
				reinterpret_cast<uint32_t const *>(src+unrolled_256_op_t::end+unrolled_128_op_t::end+unrolled_64_op_t::end)[i];
		}
	);
	unrolled_16_op_t::result(
		[&src, &dest](std::size_t i) __attribute__((no_sanitize("undefined"))) {
			reinterpret_cast<uint16_t*>(dest+unrolled_256_op_t::end+unrolled_128_op_t::end+unrolled_64_op_t::end+unrolled_32_op_t::end)[i]=
				reinterpret_cast<uint16_t const *>(src+unrolled_256_op_t::end+unrolled_128_op_t::end+unrolled_64_op_t::end+unrolled_32_op_t::end)[i];
		}
	);
	unrolled_8_op_t::result(
		[&src, &dest](std::size_t i) __attribute__((no_sanitize("undefined"))) {
			(dest+unrolled_256_op_t::end+unrolled_128_op_t::end+unrolled_64_op_t::end+unrolled_32_op_t::end+unrolled_16_op_t::end)[i]=
				(src+unrolled_256_op_t::end+unrolled_128_op_t::end+unrolled_64_op_t::end+unrolled_32_op_t::end+unrolled_16_op_t::end)[i];
		}
	);
}
766 
/// Compile-time-unrolled equality comparison of two char arrays.
/**
	Returns false immediately if the sizes differ; otherwise compares using
	the widest available register class first, each pass short-circuiting on
	the first difference.

	\param first The first array.
	\param second The second array.
	\return true if the arrays are the same size and hold identical bytes.
*/
template<
	std::size_t FirstSz,
	std::size_t SecondSz
>
inline constexpr bool FORCE_INLINE
memcmp_opt(char const (&first)[FirstSz], char const (&second)[SecondSz]) noexcept(true) {
	static_assert(FirstSz>0 && SecondSz>0, "Buffers must be non-zero.");
	// Different sizes can never compare equal; note the code below is still
	// instantiated (there is no else), it is simply unreachable at run-time.
	if constexpr (FirstSz!=SecondSz) {
		return false;
	}
	using unrolled_512_op_t=private_::aligned_unroller<
		FirstSz,
		SecondSz,
#ifdef __AVX512F__
		__m512
#else
		char[std::numeric_limits<short>::max()] // A type assuredly bigger than anything reasonable.
#endif
	>;
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wignored-attributes"
	using unrolled_256_op_t=private_::aligned_unroller<
		FirstSz-unrolled_512_op_t::end,
		SecondSz-unrolled_512_op_t::end,
#ifdef __AVX__
		__m256
#else
		char[std::numeric_limits<short>::max()] // A type assuredly bigger than anything reasonable.
#endif
	>;
	using unrolled_128_op_t=private_::aligned_unroller<
		FirstSz-unrolled_256_op_t::end,
		SecondSz-unrolled_256_op_t::end,
#ifdef __SSE__
		__m128
#else
		char[std::numeric_limits<short>::max()] // A type assuredly bigger than anything reasonable.
#endif
	>;
#pragma GCC diagnostic pop
	using unrolled_64_op_t=private_::aligned_unroller<
		FirstSz-unrolled_256_op_t::end-unrolled_128_op_t::end,
		SecondSz-unrolled_256_op_t::end-unrolled_128_op_t::end,
		uint64_t
	>;
	using unrolled_32_op_t=private_::aligned_unroller<
		FirstSz-unrolled_256_op_t::end-unrolled_128_op_t::end-unrolled_64_op_t::end,
		SecondSz-unrolled_256_op_t::end-unrolled_128_op_t::end-unrolled_64_op_t::end,
		uint32_t
	>;
	using unrolled_16_op_t=private_::aligned_unroller<
		FirstSz-unrolled_256_op_t::end-unrolled_128_op_t::end-unrolled_64_op_t::end-unrolled_32_op_t::end,
		SecondSz-unrolled_256_op_t::end-unrolled_128_op_t::end-unrolled_64_op_t::end-unrolled_32_op_t::end,
		uint16_t
	>;
	using unrolled_8_op_t=private_::aligned_unroller<
		FirstSz-unrolled_256_op_t::end-unrolled_128_op_t::end-unrolled_64_op_t::end-unrolled_32_op_t::end-unrolled_16_op_t::end,
		SecondSz-unrolled_256_op_t::end-unrolled_128_op_t::end-unrolled_64_op_t::end-unrolled_32_op_t::end-unrolled_16_op_t::end,
		uint8_t
	>;
	// After all of the larger word sizes have been use we MUST end up with only zero or one byte to copy...
	static_assert(
		(
			unrolled_256_op_t::smallest_buff-unrolled_256_op_t::end
			-unrolled_128_op_t::end
			-unrolled_64_op_t::end
			-unrolled_32_op_t::end
			-unrolled_16_op_t::end
		)<=1,
		"Oh b*ll*x. The unrolling meta-program is seriously fsck'd, please file a bug report."
	);

	// Unroll the copy in the hope that the compiler will notice the sequence of copies and optimize it.
#ifdef __AVX512F__
	// NOTE(review): `f==s` on SIMD vector types yields a vector/mask, not a
	// bool, and the offsets reuse unrolled_256_op_t::end as in memcpy_opt -
	// confirm both on an AVX-512 build.
	const bool res512=unrolled_512_op_t::result_b(
		[&first, &second](std::size_t i) {
			const typename unrolled_512_op_t::element_type f=_mm512_loadu_ps(&((first+unrolled_256_op_t::end)[i]));
			const typename unrolled_512_op_t::element_type s=_mm512_loadu_ps(&((second+unrolled_256_op_t::end)[i]));
			return f==s;
		}
	);
#endif
#ifdef __AVX__
	// NOTE(review): under __AVX512F__ this passes `res512 && lambda` as the
	// ARGUMENT to result_b, which cannot be what was intended (a capturing
	// lambda has no conversion to bool); it presumably should combine res512
	// with the call's result, as the res128/res64 chain below does - confirm.
	const bool res256=unrolled_256_op_t::result_b(
#ifdef __AVX512F__
		res512 &&
#endif
		[&first, &second](std::size_t i) {
			const typename unrolled_256_op_t::element_type f=_mm256_loadu_ps(reinterpret_cast<float const *>(&((first+unrolled_256_op_t::end)[i])));
			const typename unrolled_256_op_t::element_type s=_mm256_loadu_ps(reinterpret_cast<float const *>(&((second+unrolled_256_op_t::end)[i])));
			return f==s;
		}
	);
#endif
#ifdef __SSE__
	const bool res128=
#ifdef __AVX__
		res256 &&
#endif
		unrolled_128_op_t::result_b(
			[&first, &second](std::size_t i) {
				const typename unrolled_128_op_t::element_type f=_mm_loadu_ps(reinterpret_cast<float const *>(&((first+unrolled_256_op_t::end)[i])));
				const typename unrolled_128_op_t::element_type s=_mm_loadu_ps(reinterpret_cast<float const *>(&((second+unrolled_256_op_t::end)[i])));
				return f==s;
			}
		);
#endif
	// Integer passes: each chained with the preceding pass so a difference
	// short-circuits the remaining comparisons.
	const bool res64=
#ifdef __SSE__
		res128 &&
#endif
		unrolled_64_op_t::result_b(
			[&first, &second](std::size_t i) __attribute__((no_sanitize("undefined"))) {
				return reinterpret_cast<uint64_t const *>(second+unrolled_256_op_t::end+unrolled_128_op_t::end)[i]==
					reinterpret_cast<uint64_t const *>(first+unrolled_256_op_t::end+unrolled_128_op_t::end)[i];
			}
		);
	const bool res32=res64 && unrolled_32_op_t::result_b(
		[&first, &second](std::size_t i) __attribute__((no_sanitize("undefined"))) {
			return reinterpret_cast<uint32_t const *>(second+unrolled_256_op_t::end+unrolled_128_op_t::end+unrolled_64_op_t::end)[i]==
				reinterpret_cast<uint32_t const *>(first+unrolled_256_op_t::end+unrolled_128_op_t::end+unrolled_64_op_t::end)[i];
		}
	);
	const bool res16=res32 && unrolled_16_op_t::result_b(
		[&first, &second](std::size_t i) __attribute__((no_sanitize("undefined"))) {
			return reinterpret_cast<uint16_t const *>(second+unrolled_256_op_t::end+unrolled_128_op_t::end+unrolled_64_op_t::end+unrolled_32_op_t::end)[i]==
				reinterpret_cast<uint16_t const *>(first+unrolled_256_op_t::end+unrolled_128_op_t::end+unrolled_64_op_t::end+unrolled_32_op_t::end)[i];
		}
	);
	const bool res8=res16 && unrolled_8_op_t::result_b(
		[&first, &second](std::size_t i) __attribute__((no_sanitize("undefined"))) {
			return (second+unrolled_256_op_t::end+unrolled_128_op_t::end+unrolled_64_op_t::end+unrolled_32_op_t::end+unrolled_16_op_t::end)[i]==
				(first+unrolled_256_op_t::end+unrolled_128_op_t::end+unrolled_64_op_t::end+unrolled_32_op_t::end+unrolled_16_op_t::end)[i];
		}
	);
	return res8;
}
904 
/// Find the first occurrence of needle in a fixed-size buffer, using the
/// widest available SIMD implementation.
/**
	\tparam needle The character to find (a compile-time constant).
	\param haystack The buffer to search.
	\return Pointer to the first occurrence of needle, or nullptr.
*/
template<
	char const needle,
	std::size_t FirstSz
>
inline constexpr char const * FORCE_INLINE
strchr_opt(char const (&haystack)[FirstSz]) noexcept(true) {
	static_assert(FirstSz>0, "Buffer must be non-zero.");
	return private_::strchr_opt<
		needle,
		FirstSz,
		typename private_::select_size<FirstSz>::type
	>::result(haystack);
}
918 
/// Find the first occurrence of needle in a fixed-size haystack, using the
/// widest available SIMD implementation.
/**
	NOTE(review): the LessThan32BytesLong template parameter is never used
	in the body - confirm whether it is vestigial.

	\param haystack The buffer to search.
	\param needle The needle array; SecondSz includes the nul terminator
		when initialised from a string literal, hence the >1 requirement.
	\return Pointer to the first occurrence, or nullptr.
*/
template<
	std::size_t FirstSz,
	std::size_t SecondSz,
	class LessThan32BytesLong
>
inline constexpr char const * FORCE_INLINE
strstr_opt(char const (&haystack)[FirstSz], char const (&needle)[SecondSz]) noexcept(true) {
	static_assert(FirstSz>0 && SecondSz>1, "Buffers must be non-zero.");
	return private_::strstr_opt<
		FirstSz,
		SecondSz,
		typename private_::select_size<FirstSz>::type
	>::result(haystack, needle);
}
933 
/// Convenience overload: unrolled copy between std::arrays of char.
/**
	The arrays' contiguous storage is reinterpreted as C-style array
	references so the C-array memcpy_opt above can be reused.
*/
template<
	std::size_t SrcSz,
	std::size_t DestSz
>
inline constexpr void FORCE_INLINE
memcpy_opt(std::array<char, SrcSz> const &src, std::array<char, DestSz> &dest) noexcept(true) {
	memcpy_opt(reinterpret_cast<char const (&)[SrcSz]>(*src.data()), reinterpret_cast<char (&)[DestSz]>(*dest.data()));
}
/// Convenience overload: unrolled copy between std::arrays of uint8_t.
/**
	The byte storage is reinterpreted as char arrays so the C-array
	memcpy_opt above can be reused.
*/
template<
	std::size_t SrcSz,
	std::size_t DestSz
>
inline constexpr void FORCE_INLINE
memcpy_opt(std::array<uint8_t, SrcSz> const &src, std::array<uint8_t, DestSz> &dest) noexcept(true) {
	memcpy_opt(reinterpret_cast<char const (&)[SrcSz]>(*src.data()), reinterpret_cast<char (&)[DestSz]>(*dest.data()));
}
950 
/// Equality-compare two equally-sized std::arrays of char via the C library.
/**
	\return true if the arrays hold identical bytes.
*/
template<
	std::size_t Sz
>
inline bool
memcmp(std::array<char, Sz> const &src1, std::array<char, Sz> const &src2) noexcept(true) {
	const int difference=std::memcmp(src1.data(), src2.data(), Sz);
	return difference==0;
}
/// Equality-compare two equally-sized std::arrays of uint8_t via the C library.
/**
	\return true if the arrays hold identical bytes.
*/
template<
	std::size_t Sz
>
inline bool
memcmp(std::array<uint8_t, Sz> const &src1, std::array<uint8_t, Sz> const &src2) noexcept(true) {
	const int difference=std::memcmp(src1.data(), src2.data(), Sz);
	return difference==0;
}
965 
/// Equality operator for equally-sized std::arrays of char; delegates to
/// the memcmp overload above.
template<
	std::size_t Sz
>
inline bool
operator==(std::array<char, Sz> const &src1, std::array<char, Sz> const &src2) noexcept(true) {
	return memcmp(src1, src2);
}
/// Equality operator for equally-sized std::arrays of uint8_t; delegates to
/// the memcmp overload above.
template<
	std::size_t Sz
>
inline bool
operator==(std::array<uint8_t, Sz> const &src1, std::array<uint8_t, Sz> const &src2) noexcept(true) {
	return memcmp(src1, src2);
}
980 
981 template<
982  std::size_t SrcSz,
983  std::size_t DestSz
984 >
985 inline constexpr void
986 memcpy_slow(char const (& src)[SrcSz], char (& dest)[DestSz]) noexcept(true) {
987  static_assert(SrcSz>0 && DestSz>0, "Buffers must be non-zero.");
988  using unrolled_op_t=private_::unroll<min<std::size_t, SrcSz, DestSz>::value>;
989  // Unroll the copy in the hope that the compiler will notice the sequence of copies and optimize it.
990  unrolled_op_t::result([&src, &dest](std::size_t i) {dest[i]=src[i];});
991 }
992 
/// Copy a pair-delimited source range into a fixed-size container,
/// nul-padding the remainder.
/**
	\param src A pair of iterators/pointers delimiting [first, second) -
		presumably into a char buffer (the padding value is '\0'); the range
		must be non-empty.
	\return A T whose leading elements are copied from src (clamped to T's
		capacity) and whose remaining elements are '\0'.
*/
template<
	class T,
	class V
> inline T
copy(V const &src) noexcept(true) {
	T dest;
	const std::ptrdiff_t src_size=src.second-src.first;
	assert(src_size>0);
	// Clamp the copy to the destination's capacity.
	const std::ptrdiff_t dest_size=std::min(static_cast<typename T::size_type>(src_size), dest.size());
	memcpy(dest.begin(), src.first, dest_size);
	// Zero-fill whatever the copy did not reach.
	std::fill_n(dest.begin()+dest_size, dest.size()-dest_size, '\0');
	return dest;
}
1006 
/// Copy between differently-sized std::arrays of char, zero-filling any
/// excess destination elements.
/**
	Copies min(SrcSz, DestSz) elements. The destination is value-initialised
	first, so elements beyond the copied prefix are '\0' rather than
	indeterminate (the previous implementation left them uninitialised,
	which is undefined behaviour to read and inconsistent with the
	'\0'-padding pair-based copy() above).

	\param src The source array.
	\return The destination array.
*/
template<
	std::size_t SrcSz,
	std::size_t DestSz
> inline constexpr std::array<char, DestSz>
copy(std::array<char, SrcSz> const &src) noexcept(true) {
	static_assert(SrcSz>0 && DestSz>0, "Buffers must be non-zero.");
	std::array<char, DestSz> dest{};
	constexpr std::size_t smallest_buff=(SrcSz<DestSz ? SrcSz : DestSz);
	// A plain loop is constexpr-usable and trivially unrollable by the compiler.
	for (std::size_t i=0; i<smallest_buff; ++i) {
		dest[i]=src[i];
	}
	return dest;
}
1019 
/// Copy between differently-sized std::arrays of uint8_t, zero-filling any
/// excess destination elements.
/**
	Copies min(SrcSz, DestSz) elements. The destination is value-initialised
	first, so elements beyond the copied prefix are 0 rather than
	indeterminate (the previous implementation left them uninitialised,
	which is undefined behaviour to read and inconsistent with the
	padding pair-based copy() above).

	\param src The source array.
	\return The destination array.
*/
template<
	std::size_t SrcSz,
	std::size_t DestSz
> inline constexpr std::array<uint8_t, DestSz>
copy(std::array<uint8_t, SrcSz> const &src) noexcept(true) {
	static_assert(SrcSz>0 && DestSz>0, "Buffers must be non-zero.");
	std::array<uint8_t, DestSz> dest{};
	constexpr std::size_t smallest_buff=(SrcSz<DestSz ? SrcSz : DestSz);
	// A plain loop is constexpr-usable and trivially unrollable by the compiler.
	for (std::size_t i=0; i<smallest_buff; ++i) {
		dest[i]=src[i];
	}
	return dest;
}
1032 
1033 } }