// Fragment of the bool-returning unroll step (the enclosing signature is not visible here).
// Invoke the functor for index N-1 and remember whether it reported success.
40 const bool res=o.operator()(N-1);
// Short-circuit: recurse into unroll<N-1>::result_b only while the call above returned true;
// the first false ends the chain and becomes the overall result.
41 return res ?
unroll<N-1>::result_b(
std::forward<Op>(o)) :
false;
// Bit helper; its name and parameter list fall outside the visible lines.
59 template<
class T>
inline constexpr
// value & (value - 1) clears the lowest set bit of `value`, so the result is zero when
// at most one bit was set -- presumably T is an unsigned integer type; TODO confirm.
62 return value & (value - 1);
// Trailing-zero count helper -- presumably the bits::count_trailing_zeroes used by the SSE
// search code; the name itself is not visible here.
// Three alternative returns are visible; the conditions choosing between them are not
// (presumably they depend on the width of T -- TODO confirm).
// NOTE(review): GCC documents the ctz builtins as undefined for a zero argument -- confirm
// that callers never pass 0.
65 template<
class T>
inline constexpr
// Compiler builtin taking unsigned int.
70 return __builtin_ctz(value);
// Compiler builtin taking unsigned long.
75 return __builtin_ctzl(value);
// Fallback implementation defined elsewhere.
80 return count_trailing_zeroes_compat(value);
// Leading-zero count helper; the name and the conditions selecting each return are not
// visible here (presumably chosen by the width of T -- TODO confirm).
// NOTE(review): GCC documents the clz builtins as undefined for a zero argument -- confirm
// that callers never pass 0.
83 template<
class T>
inline constexpr
// Compiler builtin taking unsigned int.
88 return __builtin_clz(value);
// Compiler builtin taking unsigned long.
93 return __builtin_clzl(value);
// Fallback implementation defined elsewhere.
98 return count_leading_zeroes_compat(value);
// Fragment of an unroller template parameterised on a buffer size and a Unit type
// (presumably private_::aligned_unroller, which the *_opt functions use -- TODO confirm).
// Div defaults to the number of whole Units that fit into SmallestBuff bytes ...
108 std::size_t Div=SmallestBuff/
sizeof(Unit),
// ... and Rem to the bytes left over after those whole Units.
109 std::size_t Rem=SmallestBuff%
sizeof(Unit)
// Callers read the Unit type back through this alias.
112 using element_type=Unit;
// Compile-time sanity checks on the sizes; src_sz, dest_sz, smallest_buff, div, rem and
// end are members declared on lines that are not visible here.
123 static_assert(
src_sz>0 &&
dest_sz>0,
"Buffers must be non-zero.");
124 BOOST_MPL_ASSERT_RELATION(smallest_buff, >, 0);
125 BOOST_MPL_ASSERT_RELATION(
sizeof(Unit), >, 0);
// NOTE(review): if div/rem/end are unsigned these ">= 0" checks can never fail -- confirm their types.
126 BOOST_MPL_ASSERT_RELATION(div, >=, 0);
127 BOOST_MPL_ASSERT_RELATION(rem, >=, 0);
128 BOOST_MPL_ASSERT_RELATION(end, >=, 0);
// Upper bounds: none of the derived quantities may exceed the smaller buffer.
130 BOOST_MPL_ASSERT_RELATION(rem, <=,
sizeof(Unit));
131 BOOST_MPL_ASSERT_RELATION((
sizeof(Unit)*div), <=, smallest_buff);
132 BOOST_MPL_ASSERT_RELATION(rem, <=, smallest_buff);
133 BOOST_MPL_ASSERT_RELATION(end, <=, smallest_buff);
// Void form: hand the functor on to the underlying unrolled operation.
140 unrolled_op_t::result(
std::forward<Op>(o));
// Bool form: same forwarding, but the unrolled operation's bool result is returned.
146 return unrolled_op_t::result_b(
std::forward<Op>(o));
// Fragment of another template in this family: SmallestBuff is one of its template
// parameters and element_type again aliases Unit. The rest is not visible here.
156 std::size_t SmallestBuff,
160 using element_type=Unit;
// Compile-time selector parameterised on FirstSz -- presumably the select_size used by the
// search routines; the name is not visible here. The preprocessor branches below test
// __AVX__ and __SSE2__; the code they guard is not visible.
186 template<
std::size_t FirstSz>
193 #elif defined(__AVX__)
195 #elif defined(__SSE2__
)
203 #elif defined(__SSE2__
)
218 #elif defined(__AVX__)
220 #elif defined(__SSE2__
)
228 #elif defined(__SSE2__
)
// The chosen implementation is exposed as `type`, picked through three nested
// std::conditional levels whose conditions and alternatives are not visible here.
241 using type=
typename std::conditional<
244 typename std::conditional<
247 typename std::conditional<
// SSE2 search step: looks for `needle` in the 16 bytes starting at haystack.
// (`needle` is not declared in the visible lines -- presumably a template parameter; TODO confirm.)
348 static constexpr char const *
FORCE_INLINE __attribute__((pure))
349 result(
char const *
const haystack)
noexcept(
true) {
// Work in 128-bit integer vectors.
350 using element_type=__m128i;
// The vector work sits in a lambda so AddressSanitizer can be disabled for it alone --
// presumably because the 16-byte load may read past the end of the haystack; TODO confirm.
351 const auto strchr_opt_int=[&haystack]() ->
char const *
__attribute__((no_sanitize(
"address")))
__attribute__((pure)) {
// Broadcast the needle byte into all 16 lanes.
352 const element_type conv=_mm_set1_epi8(needle);
// Unaligned load of 16 bytes from the current haystack position.
353 const element_type block_first=_mm_loadu_si128(
reinterpret_cast<element_type
const *>(haystack));
// Lanes equal to the needle become all-ones, the others zero.
354 const element_type eq_needle=_mm_cmpeq_epi8(conv, block_first);
// Collapse the comparison to one bit per byte (bit k <=> byte k matched).
355 const std::uint32_t mask=_mm_movemask_epi8(eq_needle);
// Index of the lowest set bit, i.e. of the first matching byte. Any guard for an empty
// mask would be on a line that is not visible here.
357 const auto bitpos=
private_::
bits::count_trailing_zeroes(mask);
358 return haystack+bitpos;
// Run the vector search on this 16-byte block.
364 const auto ret=strchr_opt_int();
// Size still to be searched after this block; when FirstSz is already smaller than one
// vector the value is left unchanged rather than reduced.
368 constexpr const std::size_t next_portion=((FirstSz>=
sizeof(element_type)) ? (FirstSz-
sizeof(element_type)) : FirstSz);
// Continue one vector further on, using the implementation select_size picks for the
// remaining size. The condition guarding this call is not visible here.
373 typename select_size<next_portion>::type
374 >::result(haystack+
sizeof(element_type));
// Fallback search step: defers to the C library.
// (`needle` is not declared in the visible lines -- presumably a template parameter; TODO confirm.)
388 result(
char const *
const haystack)
noexcept(
true) {
// std::strchr stops at the NUL terminator, so haystack must be NUL-terminated here.
389 return std::strchr(haystack, needle);
// Compile-time size parameter of a template whose remaining header and body are not visible here.
395 std::size_t SecondSz,
// SSE2 substring-search step: examines the 16 haystack positions starting at `haystack`
// for an occurrence of `needle` (an array of SecondSz chars).
515 static constexpr char const *
FORCE_INLINE __attribute__((pure))
516 result(
char const *
const haystack,
char const (&needle)[SecondSz])
noexcept(
true) {
// Work in 128-bit integer vectors.
517 using element_type=__m128i;
// The vector work sits in a lambda so AddressSanitizer can be disabled for it alone --
// presumably because the 16-byte loads may read past the end of the haystack; TODO confirm.
518 const auto strstr_opt_int=[&haystack, &needle]() ->
char const *
__attribute__((no_sanitize(
"address")))
__attribute__((pure)) {
// Broadcast the first and the last array element of the needle.
// NOTE(review): if needle is a NUL-terminated literal, needle[SecondSz-1] is the
// terminator rather than the last visible character -- confirm what callers pass.
519 const element_type first=_mm_set1_epi8(needle[0]);
520 const element_type last=_mm_set1_epi8(needle[SecondSz-1]);
// Two unaligned 16-byte loads: one at the candidate start positions, one shifted by
// SecondSz-1 so that lane k holds the byte where a match starting at k would end.
521 const element_type block_first=_mm_loadu_si128(
reinterpret_cast<element_type
const *>(haystack));
522 const element_type block_last=_mm_loadu_si128(
reinterpret_cast<element_type
const *>(haystack+SecondSz-1));
523 const element_type eq_first=_mm_cmpeq_epi8(first, block_first);
524 const element_type eq_last=_mm_cmpeq_epi8(last, block_last);
// One bit per lane, set when EITHER the first or the last byte matched.
// NOTE(review): OR (not AND) flags positions where only one end matches -- confirm intended.
525 std::uint32_t mask=_mm_movemask_epi8(_mm_or_si128(eq_first, eq_last));
// Only needles with interior bytes (SecondSz>=3, given the SecondSz>1 precondition
// asserted by strstr_opt) need the interior comparison.
527 if constexpr ((SecondSz-2)>0) {
// Lowest flagged lane = first candidate position. Any guard for an empty mask, and
// any loop over further candidates, would be on lines that are not visible here.
528 const auto bitpos=
private_::
bits::count_trailing_zeroes(mask);
// Only compare when the interior bytes of the candidate lie within FirstSz.
529 if ((bitpos+1+SecondSz-2)<=FirstSz) {
// View the interior SecondSz-2 bytes of the candidate and of the needle as fixed-size arrays
// so the size-templated memcmp_opt can compare them.
530 using needle_cmp_t=
char const [SecondSz-2];
531 if (memcmp_opt(
reinterpret_cast<needle_cmp_t &>(haystack[bitpos+1]),
reinterpret_cast<needle_cmp_t &>(needle[1]))) {
// Interior bytes are equal: report the candidate position.
532 return haystack+bitpos;
// Run the vector search on this 16-byte block.
545 const auto ret=strstr_opt_int();
// Size still to be searched after this block; when FirstSz is already smaller than one
// vector the value is left unchanged rather than reduced.
549 constexpr const std::size_t next_portion=((FirstSz>=
sizeof(element_type)) ? (FirstSz-
sizeof(element_type)) : FirstSz);
// Continue one vector further on, using the implementation select_size picks for the
// remaining size. The condition guarding this call is not visible here.
554 typename select_size<next_portion>::type
555 >::result(haystack+
sizeof(element_type), needle);
// Fallback substring search: defers to the C library.
570 result(
char const *
const haystack,
char const (&needle)[SecondSz])
noexcept(
true) {
// std::strstr relies on NUL terminators, so both arguments must be NUL-terminated here.
571 return std::strstr(haystack, needle);
// Generic memcpy: copies n elements from src to dest for arbitrary iterator types.
// Note the count is in elements, not bytes.
577 template<
class Iter1,
class Iter2>
inline void FORCE_INLINE
578 memcpy(Iter1 dest, Iter2 src,
std::size_t n)
noexcept(
true) {
// uninitialized_copy copy-constructs each element into dest, so dest is treated as raw storage.
// Because the function is noexcept(true), a throwing element copy would terminate the program.
579 std::uninitialized_copy(src, src+n, dest);
// Specialisation for char buffers: defers to std::memcpy, so n is a byte count.
583 memcpy<
char *,
char const *>(
char *dest,
char const *src,
std::size_t n)
noexcept(
true) {
584 std::memcpy(dest, src, n);
// Specialisation for wide-character buffers: defers to std::wmemcpy, so n counts wchar_t elements.
588 memcpy<
wchar_t *,
wchar_t const *>(
wchar_t *dest,
wchar_t const *src,
std::size_t n)
noexcept(
true) {
589 std::wmemcpy(dest, src, n);
// Generic memmove: copies n elements from src to dest for arbitrary iterator types.
592 template<
class Iter1,
class Iter2>
inline void FORCE_INLINE
593 memmove(Iter1 dest, Iter2 src,
std::size_t n)
noexcept(
true) {
// NOTE(review): this is the same forward uninitialized_copy as the generic memcpy; unlike the
// std::memmove / std::wmemmove specialisations it has no handling for overlapping ranges --
// confirm that callers of the generic form never pass overlapping src/dest.
594 std::uninitialized_copy(src, src+n, dest);
// Specialisation for char buffers: defers to std::memmove (n is a byte count), which
// permits overlapping source and destination.
598 memmove<
char *,
char const *>(
char *dest,
char const *src,
std::size_t n)
noexcept(
true) {
599 std::memmove(dest, src, n);
// Specialisation for wide-character buffers: defers to std::wmemmove (n counts wchar_t
// elements), which permits overlapping source and destination.
603 memmove<
wchar_t *,
wchar_t const *>(
wchar_t *dest,
wchar_t const *src,
std::size_t n)
noexcept(
true) {
604 std::wmemmove(dest, src, n);
// Body of what is presumably the generic memset (its header is not visible here):
// assigns the value i to n consecutive elements starting at dest.
609 std::fill_n(dest, n, i);
// Specialisation for char buffers: defers to std::memset, filling n bytes with i.
// Argument order here is (dest, value, count).
613 memset<
char *,
char>(
char *dest,
char i,
std::size_t n)
noexcept(
true) {
614 std::memset(dest, i, n);
// Specialisation for wide-character buffers: defers to std::wmemset, filling n wchar_t
// elements with i.
618 memset<
wchar_t *,
wchar_t>(
wchar_t *dest,
wchar_t i,
std::size_t n)
noexcept(
true) {
619 std::wmemset(dest, i, n);
// Generic comparison of two n-element ranges (the template header is not visible here).
// Unlike std::memcmp this yields an equality result: std::equal is true when every
// element of [src1, src1+n) equals the corresponding element starting at src2.
623 memcmp(Iter src1, Iter src2,
std::size_t n)
noexcept(
true) {
624 return std::equal(src1, src1+n, src2);
// Specialisation for char buffers: true when the first n bytes are identical.
628 memcmp<
char const *>(
char const *src1,
char const *src2,
std::size_t n)
noexcept(
true) {
// std::memcmp's three-way result is reduced to equality.
629 return std::memcmp(src1, src2, n)==0;
// Specialisation for wide-character buffers: true when the first n wchar_t elements are identical.
633 memcmp<
wchar_t const *>(
wchar_t const *src1,
wchar_t const *src2,
std::size_t n)
noexcept(
true) {
// std::wmemcmp's three-way result is reduced to equality.
634 return std::wmemcmp(src1, src2, n)==0;
// Array-to-array copy for arbitrary element type Val. Note the argument order is
// (src, dest) -- the reverse of the pointer-based memcpy overloads -- and that this
// overload is declared noexcept(false).
638 memcpy(Val
const (& src)[SrcSz], Val (& dest)[DestSz])
noexcept(
false) {
// Only as many elements as fit in the smaller of the two arrays are copied.
640 smallest_buff=min<std::size_t, SrcSz, DestSz>::value
// Element-wise assignment, one call of the lambda per index supplied by the unroller.
644 unrolled_op_t::result([&src, &dest](
std::size_t i) {dest[i]=src[i];});
// Size-templated copy between two char arrays. The copy is split at compile time into
// stages of decreasing unit width (512/256/128-bit vectors, then 64/32/16/8-bit integers);
// each stage is an aligned_unroller instantiated for the bytes left by earlier stages.
// Argument order is (src, dest).
// NOTE(review): across the stages below the offsets and remaining sizes are built from
// unrolled_256_op_t::end onwards and never include unrolled_512_op_t::end (except in the
// 256-bit stage's size). That is only harmless if at most one of the 512/256-bit stages is
// ever non-empty -- confirm against the lines that are not visible here.
652 memcpy_opt(
char const (&src)[SrcSz],
char (&dest)[DestSz])
noexcept(
true) {
653 static_assert(SrcSz>0 && DestSz>0,
"Buffers must be non-zero.");
// 512-bit stage. The oversized char array is the alternative unit type -- presumably used
// when the instruction set is unavailable so the stage covers nothing; TODO confirm.
654 using unrolled_512_op_t=private_::aligned_unroller<
660 char[std::numeric_limits<
short>::max()]
// Vector types as template arguments trigger -Wignored-attributes; silence it locally.
663 #pragma GCC diagnostic push
664 #pragma GCC diagnostic ignored "-Wignored-attributes"
// 256-bit stage: works on what the 512-bit stage left over.
665 using unrolled_256_op_t=private_::aligned_unroller<
666 SrcSz-unrolled_512_op_t::end,
667 DestSz-unrolled_512_op_t::end,
671 char[std::numeric_limits<
short>::max()]
// 128-bit stage: sized from what the 256-bit stage left over.
674 using unrolled_128_op_t=private_::aligned_unroller<
675 SrcSz-unrolled_256_op_t::end,
676 DestSz-unrolled_256_op_t::end,
680 char[std::numeric_limits<
short>::max()]
683 #pragma GCC diagnostic pop
// Scalar stages: each subtracts the `end` of the stages from 256-bit onwards that precede it.
684 using unrolled_64_op_t=private_::aligned_unroller<
685 SrcSz-unrolled_256_op_t::end-unrolled_128_op_t::end,
686 DestSz-unrolled_256_op_t::end-unrolled_128_op_t::end,
689 using unrolled_32_op_t=private_::aligned_unroller<
690 SrcSz-unrolled_256_op_t::end-unrolled_128_op_t::end-unrolled_64_op_t::end,
691 DestSz-unrolled_256_op_t::end-unrolled_128_op_t::end-unrolled_64_op_t::end,
694 using unrolled_16_op_t=private_::aligned_unroller<
695 SrcSz-unrolled_256_op_t::end-unrolled_128_op_t::end-unrolled_64_op_t::end-unrolled_32_op_t::end,
696 DestSz-unrolled_256_op_t::end-unrolled_128_op_t::end-unrolled_64_op_t::end-unrolled_32_op_t::end,
699 using unrolled_8_op_t=private_::aligned_unroller<
700 SrcSz-unrolled_256_op_t::end-unrolled_128_op_t::end-unrolled_64_op_t::end-unrolled_32_op_t::end-unrolled_16_op_t::end,
701 DestSz-unrolled_256_op_t::end-unrolled_128_op_t::end-unrolled_64_op_t::end-unrolled_32_op_t::end-unrolled_16_op_t::end,
// Part of a compile-time self-check on the stage arithmetic: the smaller buffer size minus
// every stage's `end`; the comparison it feeds is on lines that are not visible here.
707 unrolled_512_op_t::smallest_buff-unrolled_512_op_t::end
708 -unrolled_256_op_t::end
709 -unrolled_128_op_t::end
710 -unrolled_64_op_t::end
711 -unrolled_32_op_t::end
712 -unrolled_16_op_t::end
714 "Oh b*ll*x. The unrolling meta-program is seriously fsck'd, please file a bug report."
// 512-bit stage: unaligned vector load from src, unaligned store to dest.
719 unrolled_512_op_t::result(
720 [&src, &dest](std::size_t i) {
721 const typename unrolled_512_op_t::element_type tmp=_mm512_loadu_ps(&((src+unrolled_256_op_t::end)[i]));
722 _mm512_storeu_ps(&((dest+unrolled_256_op_t::end)[i]), tmp);
// 256-bit stage: the float intrinsics are used purely to move bits, hence the pointer casts.
727 unrolled_256_op_t::result(
728 [&src, &dest](std::size_t i) {
729 const typename unrolled_256_op_t::element_type tmp=_mm256_loadu_ps(
reinterpret_cast<
float const *>(&((src+unrolled_256_op_t::end)[i])));
730 _mm256_storeu_ps(
reinterpret_cast<
float *>(&((dest+unrolled_256_op_t::end)[i])), tmp);
// 128-bit stage: same pattern with the SSE unaligned load/store.
735 unrolled_128_op_t::result(
736 [&src, &dest](std::size_t i) {
737 const typename unrolled_128_op_t::element_type tmp=_mm_loadu_ps(
reinterpret_cast<
float const *>(&((src+unrolled_256_op_t::end)[i])));
738 _mm_storeu_ps(
reinterpret_cast<
float *>(&((dest+unrolled_256_op_t::end)[i])), tmp);
// Scalar stages copy through integer pointers of the stage width. The casts produce
// accesses UBSan would report (presumably misaligned ones), so it is disabled per lambda.
742 unrolled_64_op_t::result(
743 [&src, &dest](std::size_t i)
__attribute__((no_sanitize(
"undefined"))) {
744 reinterpret_cast<uint64_t*>(dest+unrolled_256_op_t::end+unrolled_128_op_t::end)[i]=
reinterpret_cast<uint64_t
const *>(src+unrolled_256_op_t::end+unrolled_128_op_t::end)[i];
747 unrolled_32_op_t::result(
748 [&src, &dest](std::size_t i)
__attribute__((no_sanitize(
"undefined"))) {
749 reinterpret_cast<uint32_t*>(dest+unrolled_256_op_t::end+unrolled_128_op_t::end+unrolled_64_op_t::end)[i]=
750 reinterpret_cast<uint32_t
const *>(src+unrolled_256_op_t::end+unrolled_128_op_t::end+unrolled_64_op_t::end)[i];
753 unrolled_16_op_t::result(
754 [&src, &dest](std::size_t i)
__attribute__((no_sanitize(
"undefined"))) {
755 reinterpret_cast<uint16_t*>(dest+unrolled_256_op_t::end+unrolled_128_op_t::end+unrolled_64_op_t::end+unrolled_32_op_t::end)[i]=
756 reinterpret_cast<uint16_t
const *>(src+unrolled_256_op_t::end+unrolled_128_op_t::end+unrolled_64_op_t::end+unrolled_32_op_t::end)[i];
// Final stage: plain byte-by-byte copy of whatever remains.
759 unrolled_8_op_t::result(
760 [&src, &dest](std::size_t i)
__attribute__((no_sanitize(
"undefined"))) {
761 (dest+unrolled_256_op_t::end+unrolled_128_op_t::end+unrolled_64_op_t::end+unrolled_32_op_t::end+unrolled_16_op_t::end)[i]=
762 (src+unrolled_256_op_t::end+unrolled_128_op_t::end+unrolled_64_op_t::end+unrolled_32_op_t::end+unrolled_16_op_t::end)[i];
// Size-templated equality comparison of two char arrays, staged like memcpy_opt:
// 512/256/128-bit vector stages followed by 64/32/16/8-bit integer stages, each an
// aligned_unroller instantiated for the bytes left by earlier stages.
// NOTE(review): across the stages below the offsets and remaining sizes are built from
// unrolled_256_op_t::end onwards and never include unrolled_512_op_t::end (except in the
// 256-bit stage's size). That is only harmless if at most one of the 512/256-bit stages is
// ever non-empty -- confirm against the lines that are not visible here.
772 memcmp_opt(
char const (&first)[FirstSz],
char const (&second)[SecondSz])
noexcept(
true) {
773 static_assert(FirstSz>0 && SecondSz>0,
"Buffers must be non-zero.");
// Arrays of different sizes take a separate compile-time branch; its body is not visible here.
774 if constexpr (FirstSz!=SecondSz) {
// 512-bit stage. The oversized char array is the alternative unit type -- presumably used
// when the instruction set is unavailable so the stage covers nothing; TODO confirm.
777 using unrolled_512_op_t=private_::aligned_unroller<
783 char[std::numeric_limits<
short>::max()]
// Vector types as template arguments trigger -Wignored-attributes; silence it locally.
786 #pragma GCC diagnostic push
787 #pragma GCC diagnostic ignored "-Wignored-attributes"
// 256-bit stage: works on what the 512-bit stage left over.
788 using unrolled_256_op_t=private_::aligned_unroller<
789 FirstSz-unrolled_512_op_t::end,
790 SecondSz-unrolled_512_op_t::end,
794 char[std::numeric_limits<
short>::max()]
// 128-bit stage: sized from what the 256-bit stage left over.
797 using unrolled_128_op_t=private_::aligned_unroller<
798 FirstSz-unrolled_256_op_t::end,
799 SecondSz-unrolled_256_op_t::end,
803 char[std::numeric_limits<
short>::max()]
806 #pragma GCC diagnostic pop
// Scalar stages: each subtracts the `end` of the stages from 256-bit onwards that precede it.
807 using unrolled_64_op_t=private_::aligned_unroller<
808 FirstSz-unrolled_256_op_t::end-unrolled_128_op_t::end,
809 SecondSz-unrolled_256_op_t::end-unrolled_128_op_t::end,
812 using unrolled_32_op_t=private_::aligned_unroller<
813 FirstSz-unrolled_256_op_t::end-unrolled_128_op_t::end-unrolled_64_op_t::end,
814 SecondSz-unrolled_256_op_t::end-unrolled_128_op_t::end-unrolled_64_op_t::end,
817 using unrolled_16_op_t=private_::aligned_unroller<
818 FirstSz-unrolled_256_op_t::end-unrolled_128_op_t::end-unrolled_64_op_t::end-unrolled_32_op_t::end,
819 SecondSz-unrolled_256_op_t::end-unrolled_128_op_t::end-unrolled_64_op_t::end-unrolled_32_op_t::end,
822 using unrolled_8_op_t=private_::aligned_unroller<
823 FirstSz-unrolled_256_op_t::end-unrolled_128_op_t::end-unrolled_64_op_t::end-unrolled_32_op_t::end-unrolled_16_op_t::end,
824 SecondSz-unrolled_256_op_t::end-unrolled_128_op_t::end-unrolled_64_op_t::end-unrolled_32_op_t::end-unrolled_16_op_t::end,
// Part of a compile-time self-check on the stage arithmetic; the comparison it feeds is on
// lines that are not visible here. Unlike memcpy_opt, the visible expression starts from
// the 256-bit stage's smallest_buff.
830 unrolled_256_op_t::smallest_buff-unrolled_256_op_t::end
831 -unrolled_128_op_t::end
832 -unrolled_64_op_t::end
833 -unrolled_32_op_t::end
834 -unrolled_16_op_t::end
836 "Oh b*ll*x. The unrolling meta-program is seriously fsck'd, please file a bug report."
// 512-bit stage: load one vector from each buffer; the comparison itself is not visible here.
841 const bool res512=unrolled_512_op_t::result_b(
842 [&first, &second](std::size_t i) {
843 const typename unrolled_512_op_t::element_type f=_mm512_loadu_ps(&((first+unrolled_256_op_t::end)[i]));
844 const typename unrolled_512_op_t::element_type s=_mm512_loadu_ps(&((second+unrolled_256_op_t::end)[i]));
// 256-bit stage: the float intrinsics are used purely to load bits, hence the pointer casts.
850 const bool res256=unrolled_256_op_t::result_b(
854 [&first, &second](std::size_t i) {
855 const typename unrolled_256_op_t::element_type f=_mm256_loadu_ps(
reinterpret_cast<
float const *>(&((first+unrolled_256_op_t::end)[i])));
856 const typename unrolled_256_op_t::element_type s=_mm256_loadu_ps(
reinterpret_cast<
float const *>(&((second+unrolled_256_op_t::end)[i])));
// 128-bit stage: same pattern with the SSE unaligned load.
866 unrolled_128_op_t::result_b(
867 [&first, &second](std::size_t i) {
868 const typename unrolled_128_op_t::element_type f=_mm_loadu_ps(
reinterpret_cast<
float const *>(&((first+unrolled_256_op_t::end)[i])));
869 const typename unrolled_128_op_t::element_type s=_mm_loadu_ps(
reinterpret_cast<
float const *>(&((second+unrolled_256_op_t::end)[i])));
// Scalar stages compare through integer pointers of the stage width. The casts produce
// accesses UBSan would report (presumably misaligned ones), so it is disabled per lambda.
878 unrolled_64_op_t::result_b(
879 [&first, &second](std::size_t i)
__attribute__((no_sanitize(
"undefined"))) {
880 return reinterpret_cast<uint64_t
const *>(second+unrolled_256_op_t::end+unrolled_128_op_t::end)[i]==
881 reinterpret_cast<uint64_t
const *>(first+unrolled_256_op_t::end+unrolled_128_op_t::end)[i];
// From here each stage is chained with && to the previous result, so a mismatch found
// earlier skips the remaining narrower stages.
884 const bool res32=res64 && unrolled_32_op_t::result_b(
885 [&first, &second](std::size_t i)
__attribute__((no_sanitize(
"undefined"))) {
886 return reinterpret_cast<uint32_t
const *>(second+unrolled_256_op_t::end+unrolled_128_op_t::end+unrolled_64_op_t::end)[i]==
887 reinterpret_cast<uint32_t
const *>(first+unrolled_256_op_t::end+unrolled_128_op_t::end+unrolled_64_op_t::end)[i];
890 const bool res16=res32 && unrolled_16_op_t::result_b(
891 [&first, &second](std::size_t i)
__attribute__((no_sanitize(
"undefined"))) {
892 return reinterpret_cast<uint16_t
const *>(second+unrolled_256_op_t::end+unrolled_128_op_t::end+unrolled_64_op_t::end+unrolled_32_op_t::end)[i]==
893 reinterpret_cast<uint16_t
const *>(first+unrolled_256_op_t::end+unrolled_128_op_t::end+unrolled_64_op_t::end+unrolled_32_op_t::end)[i];
// Final stage: byte-by-byte comparison of whatever remains.
896 const bool res8=res16 && unrolled_8_op_t::result_b(
897 [&first, &second](std::size_t i)
__attribute__((no_sanitize(
"undefined"))) {
898 return (second+unrolled_256_op_t::end+unrolled_128_op_t::end+unrolled_64_op_t::end+unrolled_32_op_t::end+unrolled_16_op_t::end)[i]==
899 (first+unrolled_256_op_t::end+unrolled_128_op_t::end+unrolled_64_op_t::end+unrolled_32_op_t::end+unrolled_16_op_t::end)[i];
// Compile-time precondition of a function whose header is not visible here
// (presumably the strchr counterpart of strstr_opt -- TODO confirm): the buffer may not be empty.
911 static_assert(FirstSz>0,
"Buffer must be non-zero.");
// Substring search over fixed-size char arrays: looks for `needle` inside `haystack`.
// Only part of the template header is visible; LessThan32BytesLong is a type parameter
// whose role is not shown here.
921 std::size_t SecondSz,
922 class LessThan32BytesLong
925 strstr_opt(
char const (&haystack)[FirstSz],
char const (&needle)[SecondSz])
noexcept(
true) {
// The needle must hold at least two chars (SecondSz>1) and the haystack at least one.
926 static_assert(FirstSz>0 && SecondSz>1,
"Buffers must be non-zero.");
// Delegate to the `result` function of an implementation type selected on lines that
// are not visible here.
931 >::result(haystack, needle);
// std::array<char> convenience overload: copies src into dest via the built-in-array
// memcpy_opt. Argument order is (src, dest).
939 memcpy_opt(
std::array<
char, SrcSz>
const &src,
std::array<
char, DestSz> &dest)
noexcept(
true) {
// Reinterpret each array's first element as a reference to a built-in array of the same
// length so the size-templated overload can be selected.
940 memcpy_opt(
reinterpret_cast<
char const (&)[SrcSz]>(*src.data()),
reinterpret_cast<
char (&)[DestSz]>(*dest.data()));
// std::array<uint8_t> convenience overload: copies src into dest via the char-array
// memcpy_opt. Argument order is (src, dest).
947 memcpy_opt(
std::array<uint8_t, SrcSz>
const &src,
std::array<uint8_t, DestSz> &dest)
noexcept(
true) {
// The uint8_t storage is reinterpreted as built-in char arrays of the same length.
948 memcpy_opt(
reinterpret_cast<
char const (&)[SrcSz]>(*src.data()),
reinterpret_cast<
char (&)[DestSz]>(*dest.data()));
// Equality test for two equally-sized std::array<char>: true when all Sz bytes match.
955 memcmp(
std::array<
char, Sz>
const &src1,
std::array<
char, Sz>
const &src2)
noexcept(
true) {
// std::memcmp's three-way result is reduced to equality.
956 return std::memcmp(src1.data(), src2.data(), Sz)==0;
// Equality test for two equally-sized std::array<uint8_t>: true when all Sz bytes match.
962 memcmp(
std::array<uint8_t, Sz>
const &src1,
std::array<uint8_t, Sz>
const &src2)
noexcept(
true) {
// std::memcmp's three-way result is reduced to equality.
963 return std::memcmp(src1.data(), src2.data(), Sz)==0;
// Equality operator for std::array<char, Sz>, implemented with the memcmp overload for
// std::array<char> (which returns true on equality).
970 operator==(
std::array<
char, Sz>
const &src1,
std::array<
char, Sz>
const &src2)
noexcept(
true) {
971 return memcmp(src1, src2);
// Equality operator for std::array<uint8_t, Sz>, implemented with the memcmp overload for
// std::array<uint8_t> (which returns true on equality).
977 operator==(
std::array<uint8_t, Sz>
const &src1,
std::array<uint8_t, Sz>
const &src2)
noexcept(
true) {
978 return memcmp(src1, src2);
// constexpr char-array copy that uses plain per-element assignment instead of vector
// intrinsics. Argument order is (src, dest).
985 inline constexpr void
986 memcpy_slow(
char const (& src)[SrcSz],
char (& dest)[DestSz])
noexcept(
true) {
987 static_assert(SrcSz>0 && DestSz>0,
"Buffers must be non-zero.");
// Unroll over the smaller of the two sizes so neither array is over-run.
988 using unrolled_op_t=private_::unroll<min<std::size_t, SrcSz, DestSz>::value>;
// One assignment per index supplied by the unroller.
990 unrolled_op_t::result([&src, &dest](std::size_t i) {dest[i]=src[i];});
// Copies a range described by src.first / src.second into `dest` and pads the rest of
// dest with '\0'. `dest` is declared on a line that is not visible here (presumably a
// container of type T -- TODO confirm).
997 copy(V
const &src)
noexcept(
true) {
// Length of the source range.
// NOTE(review): a negative difference would become a very large size_type in the cast below -- confirm callers.
999 const std::ptrdiff_t src_size=src.second-src.first;
// Never copy more than dest can hold.
1001 const std::ptrdiff_t dest_size=
std::min(
static_cast<
typename T::size_type>(src_size), dest.size());
1002 memcpy(dest.begin(), src.first, dest_size);
// Fill whatever remains of dest after the copied part with NUL characters.
1003 std::fill_n(dest.begin()+dest_size, dest.size()-dest_size,
'\0');
// Builds a std::array<char, DestSz> from a std::array<char, SrcSz> by element-wise copy
// (the template parameter list itself is not visible here).
1010 >
inline constexpr std::array<
char, DestSz>
1011 copy(
std::array<
char, SrcSz>
const &src)
noexcept(
true) {
// NOTE(review): dest has no initialiser and the visible code only writes the first
// min(SrcSz, DestSz) elements -- confirm how the tail is handled when DestSz>SrcSz.
1012 std::array<
char, DestSz> dest;
1013 static_assert(SrcSz>0 && DestSz>0,
"Buffers must be non-zero.");
// Unroll over the smaller of the two sizes so neither array is over-run.
1014 using unrolled_op_t=private_::unroll<min<std::size_t, SrcSz, DestSz>::value>;
1016 unrolled_op_t::result([&src, &dest](std::size_t i) {dest[i]=src[i];});
// Builds a std::array<uint8_t, DestSz> from a std::array<uint8_t, SrcSz> by element-wise
// copy (the template parameter list itself is not visible here).
1023 >
inline constexpr std::array<uint8_t, DestSz>
1024 copy(
std::array<uint8_t, SrcSz>
const &src)
noexcept(
true) {
// NOTE(review): dest has no initialiser and the visible code only writes the first
// min(SrcSz, DestSz) elements -- confirm how the tail is handled when DestSz>SrcSz.
1025 std::array<uint8_t, DestSz> dest;
1026 static_assert(SrcSz>0 && DestSz>0,
"Buffers must be non-zero.");
// Unroll over the smaller of the two sizes so neither array is over-run.
1027 using unrolled_op_t=private_::unroll<min<std::size_t, SrcSz, DestSz>::value>;
1029 unrolled_op_t::result([&src, &dest](std::size_t i) {dest[i]=src[i];});