Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitee05a89

Browse files
Add Neon implementations of std::reverse, std::reverse_copy (#5870)
Co-authored-by: Stephan T. Lavavej <stl@microsoft.com>
1 parent5ad3cb0 commitee05a89

File tree

2 files changed

+204
-19
lines changed

2 files changed

+204
-19
lines changed

‎stl/inc/xutility‎

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,8 @@ _STL_DISABLE_CLANG_WARNINGS
9595
#define _VECTORIZED_REMOVE _VECTORIZED_FOR_X64_X86
9696
#define _VECTORIZED_REMOVE_COPY _VECTORIZED_FOR_X64_X86
9797
#define _VECTORIZED_REPLACE _VECTORIZED_FOR_X64_X86
98-
#define _VECTORIZED_REVERSE_VECTORIZED_FOR_X64_X86
99-
#define _VECTORIZED_REVERSE_COPY_VECTORIZED_FOR_X64_X86
98+
#define _VECTORIZED_REVERSE_VECTORIZED_FOR_X64_X86_ARM64
99+
#define _VECTORIZED_REVERSE_COPY_VECTORIZED_FOR_X64_X86_ARM64
100100
#define _VECTORIZED_ROTATE _VECTORIZED_FOR_X64_X86_ARM64
101101
#define _VECTORIZED_SEARCH _VECTORIZED_FOR_X64_X86
102102
#define _VECTORIZED_SEARCH_N _VECTORIZED_FOR_X64_X86

‎stl/src/vector_algorithms.cpp‎

Lines changed: 202 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -523,9 +523,208 @@ __declspec(noalias) void __stdcall __std_rotate(void* _First, void* const _Mid,
523523

524524
}// extern "C"
525525

526-
#ifndef _M_ARM64
527526
namespace {
528527
namespace_Reversing {
528+
template<class_BidIt>
529+
void_Reverse_tail(_BidIt _First, _BidIt _Last)noexcept {
530+
for (; _First != _Last && _First != --_Last; ++_First) {
531+
constauto _Temp = *_First;
532+
*_First = *_Last;
533+
*_Last = _Temp;
534+
}
535+
}
536+
537+
template<class_BidIt,class_OutIt>
538+
void_Reverse_copy_tail(const _BidIt _First, _BidIt _Last, _OutIt _Dest)noexcept {
539+
while (_First != _Last) {
540+
*_Dest++ = *--_Last;
541+
}
542+
}
543+
544+
#ifdef _M_ARM64
545+
struct_Traits_1 {
546+
staticuint8x8_t_Rev(constuint8x8_t _Val)noexcept {
547+
returnvrev64_u8(_Val);
548+
}
549+
550+
staticuint8x16_t_Rev(constuint8x16_t _Val)noexcept {
551+
constuint8x16_t _Rev_val =vrev64q_u8(_Val);
552+
returnvextq_u8(_Rev_val, _Rev_val,8);
553+
}
554+
};
555+
556+
struct_Traits_2 {
557+
staticuint8x8_t_Rev(constuint8x8_t _Val)noexcept {
558+
returnvreinterpret_u8_u16(vrev64_u16(vreinterpret_u16_u8(_Val)));
559+
}
560+
561+
staticuint8x16_t_Rev(constuint8x16_t _Val)noexcept {
562+
constuint8x16_t _Rev_val =vreinterpretq_u8_u16(vrev64q_u16(vreinterpretq_u16_u8(_Val)));
563+
returnvextq_u8(_Rev_val, _Rev_val,8);
564+
}
565+
};
566+
567+
struct_Traits_4 {
568+
staticuint8x8_t_Rev(constuint8x8_t _Val)noexcept {
569+
returnvreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(_Val)));
570+
}
571+
572+
staticuint8x16_t_Rev(constuint8x16_t _Val)noexcept {
573+
constuint8x16_t _Rev_val =vreinterpretq_u8_u32(vrev64q_u32(vreinterpretq_u32_u8(_Val)));
574+
returnvextq_u8(_Rev_val, _Rev_val,8);
575+
}
576+
};
577+
578+
struct_Traits_8 {
579+
staticuint8x8_t_Rev(constuint8x8_t _Val)noexcept {
580+
return _Val;
581+
}
582+
583+
staticuint8x16_t_Rev(constuint8x16_t _Val)noexcept {
584+
returnvextq_u8(_Val, _Val,8);
585+
}
586+
};
587+
588+
template<class_Traits,class_Ty>
589+
__declspec(noalias) void __cdecl _Reverse_impl(void* _First,void* _Last)noexcept {
590+
if (constsize_t _Length =_Byte_length(_First, _Last); _Length >=64) {
591+
constvoid* _Stop_at = _First;
592+
constexprsize_t _Mask_32 = ~((static_cast<size_t>(1) <<5) -1);
593+
_Advance_bytes(_Stop_at, (_Length >>1) & _Mask_32);
594+
do {
595+
_Advance_bytes(_Last, -32);
596+
597+
constuint8x16_t _Left1 =vld1q_u8(static_cast<uint8_t*>(_First) +0);
598+
constuint8x16_t _Left2 =vld1q_u8(static_cast<uint8_t*>(_First) +16);
599+
constuint8x16_t _Right1 =vld1q_u8(static_cast<uint8_t*>(_Last) +0);
600+
constuint8x16_t _Right2 =vld1q_u8(static_cast<uint8_t*>(_Last) +16);
601+
602+
constuint8x16_t _Left1_reversed =_Traits::_Rev(_Left1);
603+
constuint8x16_t _Left2_reversed =_Traits::_Rev(_Left2);
604+
constuint8x16_t _Right1_reversed =_Traits::_Rev(_Right1);
605+
constuint8x16_t _Right2_reversed =_Traits::_Rev(_Right2);
606+
607+
vst1q_u8(static_cast<uint8_t*>(_First) +0, _Right2_reversed);
608+
vst1q_u8(static_cast<uint8_t*>(_First) +16, _Right1_reversed);
609+
vst1q_u8(static_cast<uint8_t*>(_Last) +0, _Left2_reversed);
610+
vst1q_u8(static_cast<uint8_t*>(_Last) +16, _Left1_reversed);
611+
612+
_Advance_bytes(_First,32);
613+
}while (_First != _Stop_at);
614+
}
615+
616+
if (constsize_t _Length =_Byte_length(_First, _Last); _Length >=32) {
617+
_Advance_bytes(_Last, -16);
618+
constuint8x16_t _Left =vld1q_u8(static_cast<uint8_t*>(_First));
619+
constuint8x16_t _Right =vld1q_u8(static_cast<uint8_t*>(_Last));
620+
621+
constuint8x16_t _Left_reversed =_Traits::_Rev(_Left);
622+
constuint8x16_t _Right_reversed =_Traits::_Rev(_Right);
623+
624+
vst1q_u8(static_cast<uint8_t*>(_First), _Right_reversed);
625+
vst1q_u8(static_cast<uint8_t*>(_Last), _Left_reversed);
626+
_Advance_bytes(_First,16);
627+
}
628+
629+
if (constsize_t _Length =_Byte_length(_First, _Last); _Length >=16) {
630+
_Advance_bytes(_Last, -8);
631+
constuint8x8_t _Left =vld1_u8(static_cast<uint8_t*>(_First));
632+
constuint8x8_t _Right =vld1_u8(static_cast<uint8_t*>(_Last));
633+
634+
constuint8x8_t _Left_reversed =_Traits::_Rev(_Left);
635+
constuint8x8_t _Right_reversed =_Traits::_Rev(_Right);
636+
637+
vst1_u8(static_cast<uint8_t*>(_First), _Right_reversed);
638+
vst1_u8(static_cast<uint8_t*>(_Last), _Left_reversed);
639+
_Advance_bytes(_First,8);
640+
}
641+
642+
ifconstexpr (sizeof(_Ty) <8) {
643+
if (constsize_t _Length =_Byte_length(_First, _Last); _Length >=8) {
644+
_Advance_bytes(_Last, -8);
645+
646+
// Intentional overlapped loads/stores: read both sides first, then write.
647+
constuint8x8_t _Left =vld1_u8(static_cast<uint8_t*>(_First));
648+
constuint8x8_t _Right =vld1_u8(static_cast<uint8_t*>(_Last));
649+
650+
constuint8x8_t _Left_reversed =_Traits::_Rev(_Left);
651+
constuint8x8_t _Right_reversed =_Traits::_Rev(_Right);
652+
653+
vst1_u8(static_cast<uint8_t*>(_First), _Right_reversed);
654+
vst1_u8(static_cast<uint8_t*>(_Last), _Left_reversed);
655+
656+
// Overlapped stores cover any 8-15B remainder, so do not fall through to scalar tail.
657+
return;
658+
}
659+
}
660+
661+
ifconstexpr (sizeof(_Ty) <4) {
662+
_Reverse_tail(static_cast<_Ty*>(_First),static_cast<_Ty*>(_Last));
663+
}
664+
}
665+
666+
template<class_Traits,class_Ty>
667+
__declspec(noalias) void __cdecl _Reverse_copy_impl(
668+
constvoid* _First,constvoid* _Last,void* _Dest)noexcept {
669+
if (constsize_t _Length =_Byte_length(_First, _Last); _Length >=64) {
670+
constvoid* _Stop_at = _Dest;
671+
constexprsize_t _Mask_64 = ~((static_cast<size_t>(1) <<6) -1);
672+
_Advance_bytes(_Stop_at, _Length & _Mask_64);
673+
do {
674+
_Advance_bytes(_Last, -64);
675+
constuint8x16_t _Block1 =vld1q_u8(static_cast<constuint8_t*>(_Last) +0);
676+
constuint8x16_t _Block2 =vld1q_u8(static_cast<constuint8_t*>(_Last) +16);
677+
constuint8x16_t _Block3 =vld1q_u8(static_cast<constuint8_t*>(_Last) +32);
678+
constuint8x16_t _Block4 =vld1q_u8(static_cast<constuint8_t*>(_Last) +48);
679+
680+
constuint8x16_t _Block1_reversed =_Traits::_Rev(_Block1);
681+
constuint8x16_t _Block2_reversed =_Traits::_Rev(_Block2);
682+
constuint8x16_t _Block3_reversed =_Traits::_Rev(_Block3);
683+
constuint8x16_t _Block4_reversed =_Traits::_Rev(_Block4);
684+
685+
vst1q_u8(static_cast<uint8_t*>(_Dest) +0, _Block4_reversed);
686+
vst1q_u8(static_cast<uint8_t*>(_Dest) +16, _Block3_reversed);
687+
vst1q_u8(static_cast<uint8_t*>(_Dest) +32, _Block2_reversed);
688+
vst1q_u8(static_cast<uint8_t*>(_Dest) +48, _Block1_reversed);
689+
_Advance_bytes(_Dest,64);
690+
}while (_Dest != _Stop_at);
691+
}
692+
693+
if (constsize_t _Length =_Byte_length(_First, _Last); _Length >=32) {
694+
_Advance_bytes(_Last, -32);
695+
constuint8x16_t _Block1 =vld1q_u8(static_cast<constuint8_t*>(_Last) +0);
696+
constuint8x16_t _Block2 =vld1q_u8(static_cast<constuint8_t*>(_Last) +16);
697+
698+
constuint8x16_t _Block1_reversed =_Traits::_Rev(_Block1);
699+
constuint8x16_t _Block2_reversed =_Traits::_Rev(_Block2);
700+
701+
vst1q_u8(static_cast<uint8_t*>(_Dest) +0, _Block2_reversed);
702+
vst1q_u8(static_cast<uint8_t*>(_Dest) +16, _Block1_reversed);
703+
_Advance_bytes(_Dest,32);
704+
}
705+
706+
if (constsize_t _Length =_Byte_length(_First, _Last); _Length >=16) {
707+
_Advance_bytes(_Last, -16);
708+
constuint8x16_t _Block =vld1q_u8(static_cast<constuint8_t*>(_Last));
709+
constuint8x16_t _Block_reversed =_Traits::_Rev(_Block);
710+
vst1q_u8(static_cast<uint8_t*>(_Dest), _Block_reversed);
711+
_Advance_bytes(_Dest,16);
712+
}
713+
714+
if (constsize_t _Length =_Byte_length(_First, _Last); _Length >=8) {
715+
_Advance_bytes(_Last, -8);
716+
constuint8x8_t _Block =vld1_u8(static_cast<constuint8_t*>(_Last));
717+
constuint8x8_t _Block_reversed =_Traits::_Rev(_Block);
718+
vst1_u8(static_cast<uint8_t*>(_Dest), _Block_reversed);
719+
_Advance_bytes(_Dest,8);
720+
}
721+
722+
ifconstexpr (sizeof(_Ty) <8) {
723+
_Reverse_copy_tail(
724+
static_cast<const _Ty*>(_First),static_cast<const _Ty*>(_Last),static_cast<_Ty*>(_Dest));
725+
}
726+
}
727+
#else// ^^^ defined(_M_ARM64) / !defined(_M_ARM64) vvv
529728
#ifdef _M_ARM64EC
530729
using _Traits_1 =void;
531730
using _Traits_2 =void;
@@ -586,22 +785,6 @@ namespace {
586785
};
587786
#endif// ^^^ !defined(_M_ARM64EC) ^^^
588787

589-
template<class_BidIt>
590-
void_Reverse_tail(_BidIt _First, _BidIt _Last)noexcept {
591-
for (; _First != _Last && _First != --_Last; ++_First) {
592-
constauto _Temp = *_First;
593-
*_First = *_Last;
594-
*_Last = _Temp;
595-
}
596-
}
597-
598-
template<class_BidIt,class_OutIt>
599-
void_Reverse_copy_tail(const _BidIt _First, _BidIt _Last, _OutIt _Dest)noexcept {
600-
while (_First != _Last) {
601-
*_Dest++ = *--_Last;
602-
}
603-
}
604-
605788
#ifndef _M_ARM64EC
606789
__m256i_Avx2_rev_tail_mask_32(constsize_t _Count_in_bytes)noexcept {
607790
// _Count_in_bytes must be within [0, 32].
@@ -700,6 +883,7 @@ namespace {
700883
_Reverse_copy_tail(
701884
static_cast<const _Ty*>(_First),static_cast<const _Ty*>(_Last),static_cast<_Ty*>(_Dest));
702885
}
886+
#endif// ^^^ !defined(_M_ARM64) ^^^
703887
}// namespace _Reversing
704888
}// unnamed namespace
705889

@@ -743,6 +927,7 @@ __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_8(
743927

744928
}// extern "C"
745929

930+
#ifndef _M_ARM64
746931
namespace {
747932
namespace_Sorting {
748933
enum _Min_max_mode {

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp