@@ -523,9 +523,208 @@ __declspec(noalias) void __stdcall __std_rotate(void* _First, void* const _Mid,
523523
524524}// extern "C"
525525
526- #ifndef _M_ARM64
527526namespace {
528527namespace _Reversing {
// Scalar fallback: reverses [_First, _Last) in place by swapping elements
// pairwise from both ends until the cursors meet.
template <class _BidIt>
void _Reverse_tail(_BidIt _First, _BidIt _Last) noexcept {
    while (_First != _Last) {
        --_Last;
        if (_First == _Last) {
            break; // odd element count: the middle element stays put
        }

        const auto _Tmp = *_Last;
        *_Last          = *_First;
        *_First         = _Tmp;
        ++_First;
    }
}
536+
// Scalar fallback: copies [_First, _Last) to _Dest in reverse order by
// walking the source backwards while the destination advances forwards.
template <class _BidIt, class _OutIt>
void _Reverse_copy_tail(const _BidIt _First, _BidIt _Last, _OutIt _Dest) noexcept {
    for (; _First != _Last; ++_Dest) {
        --_Last;
        *_Dest = *_Last;
    }
}
543+
544+ #ifdef _M_ARM64
// 1-byte elements: element reversal is full byte reversal of the register.
struct _Traits_1 {
    static uint8x8_t _Rev(const uint8x8_t _Val) noexcept {
        // Single 64-bit chunk: byte-reverse it directly.
        return vrev64_u8(_Val);
    }

    static uint8x16_t _Rev(const uint8x16_t _Val) noexcept {
        // Byte-reverse within each 64-bit half, then exchange the halves.
        const uint8x16_t _Halves_reversed = vrev64q_u8(_Val);
        return vextq_u8(_Halves_reversed, _Halves_reversed, 8);
    }
};
555+
// 2-byte elements: reverse the order of the 16-bit lanes.
struct _Traits_2 {
    static uint8x8_t _Rev(const uint8x8_t _Val) noexcept {
        // Single 64-bit chunk: reverse its four 16-bit lanes.
        return vreinterpret_u8_u16(vrev64_u16(vreinterpret_u16_u8(_Val)));
    }

    static uint8x16_t _Rev(const uint8x16_t _Val) noexcept {
        // Reverse 16-bit lanes within each 64-bit half, then exchange the halves.
        const uint8x16_t _Halves_reversed = vreinterpretq_u8_u16(vrev64q_u16(vreinterpretq_u16_u8(_Val)));
        return vextq_u8(_Halves_reversed, _Halves_reversed, 8);
    }
};
566+
// 4-byte elements: reverse the order of the 32-bit lanes.
struct _Traits_4 {
    static uint8x8_t _Rev(const uint8x8_t _Val) noexcept {
        // Single 64-bit chunk: swap its two 32-bit lanes.
        return vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(_Val)));
    }

    static uint8x16_t _Rev(const uint8x16_t _Val) noexcept {
        // Swap 32-bit lanes within each 64-bit half, then exchange the halves.
        const uint8x16_t _Halves_reversed = vreinterpretq_u8_u32(vrev64q_u32(vreinterpretq_u32_u8(_Val)));
        return vextq_u8(_Halves_reversed, _Halves_reversed, 8);
    }
};
577+
// 8-byte elements: each 64-bit lane is a whole element, so only lane order matters.
struct _Traits_8 {
    static uint8x8_t _Rev(const uint8x8_t _Val) noexcept {
        // One element fills the vector; nothing to reorder.
        return _Val;
    }

    static uint8x16_t _Rev(const uint8x16_t _Val) noexcept {
        // Two elements: exchange the 64-bit halves.
        return vextq_u8(_Val, _Val, 8);
    }
};
587+
// Reverses the byte range [_First, _Last) in place, viewed as an array of _Ty.
// _Traits::_Rev reverses the order of the _Ty elements held in one NEON register.
template <class _Traits, class _Ty>
__declspec(noalias) void __cdecl _Reverse_impl(void* _First, void* _Last) noexcept {
    if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 64) {
        // Main loop: exchange reversed 32-byte chunks taken from both ends,
        // stopping once the front cursor reaches the rounded-down midpoint.
        const void* _Stop_at      = _First;
        constexpr size_t _Mask_32 = ~((static_cast<size_t>(1) << 5) - 1);
        _Advance_bytes(_Stop_at, (_Length >> 1) & _Mask_32);
        do {
            _Advance_bytes(_Last, -32);

            const uint8x16_t _Left1  = vld1q_u8(static_cast<uint8_t*>(_First) + 0);
            const uint8x16_t _Left2  = vld1q_u8(static_cast<uint8_t*>(_First) + 16);
            const uint8x16_t _Right1 = vld1q_u8(static_cast<uint8_t*>(_Last) + 0);
            const uint8x16_t _Right2 = vld1q_u8(static_cast<uint8_t*>(_Last) + 16);

            const uint8x16_t _Rev_left1  = _Traits::_Rev(_Left1);
            const uint8x16_t _Rev_left2  = _Traits::_Rev(_Left2);
            const uint8x16_t _Rev_right1 = _Traits::_Rev(_Right1);
            const uint8x16_t _Rev_right2 = _Traits::_Rev(_Right2);

            // reverse([A B]) == [rev(B) rev(A)], so the two registers of each
            // chunk also trade places when written to the opposite end.
            vst1q_u8(static_cast<uint8_t*>(_First) + 0, _Rev_right2);
            vst1q_u8(static_cast<uint8_t*>(_First) + 16, _Rev_right1);
            vst1q_u8(static_cast<uint8_t*>(_Last) + 0, _Rev_left2);
            vst1q_u8(static_cast<uint8_t*>(_Last) + 16, _Rev_left1);

            _Advance_bytes(_First, 32);
        } while (_First != _Stop_at);
    }

    if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 32) {
        // One 16-byte exchange between the two ends.
        _Advance_bytes(_Last, -16);
        const uint8x16_t _Left  = vld1q_u8(static_cast<uint8_t*>(_First));
        const uint8x16_t _Right = vld1q_u8(static_cast<uint8_t*>(_Last));

        const uint8x16_t _Rev_left  = _Traits::_Rev(_Left);
        const uint8x16_t _Rev_right = _Traits::_Rev(_Right);

        vst1q_u8(static_cast<uint8_t*>(_First), _Rev_right);
        vst1q_u8(static_cast<uint8_t*>(_Last), _Rev_left);
        _Advance_bytes(_First, 16);
    }

    if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 16) {
        // One 8-byte exchange between the two ends.
        _Advance_bytes(_Last, -8);
        const uint8x8_t _Left  = vld1_u8(static_cast<uint8_t*>(_First));
        const uint8x8_t _Right = vld1_u8(static_cast<uint8_t*>(_Last));

        const uint8x8_t _Rev_left  = _Traits::_Rev(_Left);
        const uint8x8_t _Rev_right = _Traits::_Rev(_Right);

        vst1_u8(static_cast<uint8_t*>(_First), _Rev_right);
        vst1_u8(static_cast<uint8_t*>(_Last), _Rev_left);
        _Advance_bytes(_First, 8);
    }

    if constexpr (sizeof(_Ty) < 8) {
        if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 8) {
            _Advance_bytes(_Last, -8);

            // Intentional overlapped loads/stores: read both sides first, then write.
            const uint8x8_t _Left  = vld1_u8(static_cast<uint8_t*>(_First));
            const uint8x8_t _Right = vld1_u8(static_cast<uint8_t*>(_Last));

            const uint8x8_t _Rev_left  = _Traits::_Rev(_Left);
            const uint8x8_t _Rev_right = _Traits::_Rev(_Right);

            vst1_u8(static_cast<uint8_t*>(_First), _Rev_right);
            vst1_u8(static_cast<uint8_t*>(_Last), _Rev_left);

            // Overlapped stores cover any 8-15B remainder, so do not fall through to scalar tail.
            return;
        }
    }

    if constexpr (sizeof(_Ty) < 4) {
        // Under 8 bytes remain; for 4-byte elements that is at most one element
        // (a no-op), so only 1- and 2-byte elements need the scalar tail.
        _Reverse_tail(static_cast<_Ty*>(_First), static_cast<_Ty*>(_Last));
    }
}
665+
// Copies the byte range [_First, _Last), viewed as an array of _Ty, to _Dest
// with the element order reversed. _Traits::_Rev reverses the order of the _Ty
// elements held in one NEON register.
template <class _Traits, class _Ty>
__declspec(noalias) void __cdecl _Reverse_copy_impl(
    const void* _First, const void* _Last, void* _Dest) noexcept {
    if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 64) {
        // Main loop: consume 64 source bytes per iteration, back to front.
        const void* _Stop_at      = _Dest;
        constexpr size_t _Mask_64 = ~((static_cast<size_t>(1) << 6) - 1);
        _Advance_bytes(_Stop_at, _Length & _Mask_64);
        do {
            _Advance_bytes(_Last, -64);
            const uint8x16_t _Block1 = vld1q_u8(static_cast<const uint8_t*>(_Last) + 0);
            const uint8x16_t _Block2 = vld1q_u8(static_cast<const uint8_t*>(_Last) + 16);
            const uint8x16_t _Block3 = vld1q_u8(static_cast<const uint8_t*>(_Last) + 32);
            const uint8x16_t _Block4 = vld1q_u8(static_cast<const uint8_t*>(_Last) + 48);

            const uint8x16_t _Rev1 = _Traits::_Rev(_Block1);
            const uint8x16_t _Rev2 = _Traits::_Rev(_Block2);
            const uint8x16_t _Rev3 = _Traits::_Rev(_Block3);
            const uint8x16_t _Rev4 = _Traits::_Rev(_Block4);

            // The four registers are written in swapped order so the whole
            // 64-byte chunk comes out reversed.
            vst1q_u8(static_cast<uint8_t*>(_Dest) + 0, _Rev4);
            vst1q_u8(static_cast<uint8_t*>(_Dest) + 16, _Rev3);
            vst1q_u8(static_cast<uint8_t*>(_Dest) + 32, _Rev2);
            vst1q_u8(static_cast<uint8_t*>(_Dest) + 48, _Rev1);
            _Advance_bytes(_Dest, 64);
        } while (_Dest != _Stop_at);
    }

    if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 32) {
        // One reversed 32-byte chunk.
        _Advance_bytes(_Last, -32);
        const uint8x16_t _Block1 = vld1q_u8(static_cast<const uint8_t*>(_Last) + 0);
        const uint8x16_t _Block2 = vld1q_u8(static_cast<const uint8_t*>(_Last) + 16);

        const uint8x16_t _Rev1 = _Traits::_Rev(_Block1);
        const uint8x16_t _Rev2 = _Traits::_Rev(_Block2);

        vst1q_u8(static_cast<uint8_t*>(_Dest) + 0, _Rev2);
        vst1q_u8(static_cast<uint8_t*>(_Dest) + 16, _Rev1);
        _Advance_bytes(_Dest, 32);
    }

    if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 16) {
        // One reversed 16-byte chunk.
        _Advance_bytes(_Last, -16);
        const uint8x16_t _Block = vld1q_u8(static_cast<const uint8_t*>(_Last));
        const uint8x16_t _Rev   = _Traits::_Rev(_Block);
        vst1q_u8(static_cast<uint8_t*>(_Dest), _Rev);
        _Advance_bytes(_Dest, 16);
    }

    if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 8) {
        // One reversed 8-byte chunk.
        _Advance_bytes(_Last, -8);
        const uint8x8_t _Block = vld1_u8(static_cast<const uint8_t*>(_Last));
        const uint8x8_t _Rev   = _Traits::_Rev(_Block);
        vst1_u8(static_cast<uint8_t*>(_Dest), _Rev);
        _Advance_bytes(_Dest, 8);
    }

    if constexpr (sizeof(_Ty) < 8) {
        // Under 8 bytes remain; 8-byte elements cannot leave a remainder.
        _Reverse_copy_tail(
            static_cast<const _Ty*>(_First), static_cast<const _Ty*>(_Last), static_cast<_Ty*>(_Dest));
    }
}
727+ #else // ^^^ defined(_M_ARM64) / !defined(_M_ARM64) vvv
529728#ifdef _M_ARM64EC
530729using _Traits_1 =void ;
531730using _Traits_2 =void ;
@@ -586,22 +785,6 @@ namespace {
586785 };
587786#endif // ^^^ !defined(_M_ARM64EC) ^^^
588787
589- template <class _BidIt >
590- void _Reverse_tail (_BidIt _First, _BidIt _Last)noexcept {
591- for (; _First != _Last && _First != --_Last; ++_First) {
592- const auto _Temp = *_First;
593- *_First = *_Last;
594- *_Last = _Temp;
595- }
596- }
597-
598- template <class _BidIt ,class _OutIt >
599- void _Reverse_copy_tail (const _BidIt _First, _BidIt _Last, _OutIt _Dest)noexcept {
600- while (_First != _Last) {
601- *_Dest++ = *--_Last;
602- }
603- }
604-
605788#ifndef _M_ARM64EC
606789 __m256i_Avx2_rev_tail_mask_32 (const size_t _Count_in_bytes)noexcept {
607790// _Count_in_bytes must be within [0, 32].
@@ -700,6 +883,7 @@ namespace {
700883_Reverse_copy_tail (
701884static_cast <const _Ty*>(_First),static_cast <const _Ty*>(_Last),static_cast <_Ty*>(_Dest));
702885 }
886+ #endif // ^^^ !defined(_M_ARM64) ^^^
703887 }// namespace _Reversing
704888}// unnamed namespace
705889
@@ -743,6 +927,7 @@ __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_8(
743927
744928}// extern "C"
745929
930+ #ifndef _M_ARM64
746931namespace {
747932namespace _Sorting {
748933enum _Min_max_mode {