@@ -3224,6 +3224,53 @@ namespace {
32243224_Advance_bytes (_First,32 );
32253225 }while (_First != _Stop_at);
32263226
3227+
3228+ if (const size_t _Tail = _Length &0x1C ; _Tail !=0 ) {
3229+ const __m256i _Tail_mask =_Avx2_tail_mask_32 (_Tail);
3230+ const __m256i _Data =_mm256_maskload_epi32 (reinterpret_cast <const int *>(_First), _Tail_mask);
3231+
3232+ const __m256i _Cmp =_Traits::_Cmp_avx (_Comparand, _Data);
3233+ const uint32_t _Mask =_mm256_movemask_epi8 (_mm256_and_si256 (_Cmp, _Tail_mask));
3234+
3235+ uint64_t _Msk_with_carry =uint64_t {_Carry} | (uint64_t {_Mask} <<32 );
3236+ uint64_t _MskX = _Msk_with_carry;
3237+
3238+ _MskX = (_MskX >>sizeof (_Ty)) & _MskX;
3239+
3240+ if constexpr (sizeof (_Ty) ==1 ) {
3241+ _MskX =__ull_rshift (_MskX, _Sh1) & _MskX;
3242+ }
3243+
3244+ if constexpr (sizeof (_Ty) <4 ) {
3245+ _MskX =__ull_rshift (_MskX, _Sh2) & _MskX;
3246+ }
3247+
3248+ if constexpr (sizeof (_Ty) <8 ) {
3249+ _MskX =__ull_rshift (_MskX, _Sh3) & _MskX;
3250+ }
3251+
3252+ if (_MskX !=0 ) {
3253+ #ifdef _M_IX86
3254+ const uint32_t _MskLow =static_cast <uint32_t >(_MskX);
3255+
3256+ const int _Shift = _MskLow !=0
3257+ ?static_cast <int >(_tzcnt_u32 (_MskLow)) -32
3258+ :static_cast <int >(_tzcnt_u32 (static_cast <uint32_t >(_MskX >>32 )));
3259+
3260+ #elifdef _M_X64
3261+ const long long _Shift =static_cast <long long >(_tzcnt_u64 (_MskX)) -32 ;
3262+ #else
3263+ #error Unsupported architecture
3264+ #endif
3265+ _Advance_bytes (_First, _Shift);
3266+ return _First;
3267+ }
3268+
3269+ _Carry =static_cast <uint32_t >(__ull_rshift (_Msk_with_carry,static_cast <int >(_Tail)));
3270+
3271+ _Advance_bytes (_First, _Tail);
3272+ }
3273+
32273274 _Mid1 =static_cast <const _Ty*>(_First);
32283275_Rewind_bytes (_First,_lzcnt_u32 (~_Carry));
32293276 }else if constexpr (sizeof (_Ty) <8 ) {