Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 6280185

Browse files
Improve masked load/store for sse2 when only the first element is selected
1 parent 23faec4 commit 6280185

File tree

1 file changed

+42
-3
lines changed

1 file changed

+42
-3
lines changed

‎include/xsimd/arch/xsimd_sse2.hpp‎

Lines changed: 42 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1068,10 +1068,45 @@ namespace xsimd
10681068
}
10691069

10701070
// load_masked
1071+
template <class A, class T, bool... Values, class Mode, class = typename std::enable_if<std::is_integral<T>::value>::type>
1072+
XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<sse2>) noexcept
1073+
{
1074+
XSIMD_IF_CONSTEXPR(mask.mask() ==0x1)
1075+
{
1076+
XSIMD_IF_CONSTEXPR(sizeof(T) ==2)
1077+
{
1078+
return _mm_loadu_si16(mem);
1079+
}
1080+
XSIMD_IF_CONSTEXPR(sizeof(T) ==4)
1081+
{
1082+
return _mm_loadu_si32(mem);
1083+
}
1084+
XSIMD_IF_CONSTEXPR(sizeof(T) ==8)
1085+
{
1086+
return _mm_loadu_si64(mem);
1087+
}
1088+
}
1089+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2 && mask.mask() == 0x3)
1090+
{
1091+
return _mm_loadu_si32(mem);
1092+
}
1093+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4 && mask.mask() == 0x3)
1094+
{
1095+
return _mm_loadu_si64(mem);
1096+
}
1097+
else
1098+
{
1099+
return load_masked<A>(mem, mask, convert<T> {}, Mode {}, common {});
1100+
}
1101+
}
10711102
template <class A, bool... Values, class Mode>
10721103
XSIMD_INLINE batch<float, A> load_masked(float const* mem, batch_bool_constant<float, A, Values...> mask, Mode, requires_arch<sse2>) noexcept
10731104
{
1074-
XSIMD_IF_CONSTEXPR(mask.countr_one() ==2)
1105+
XSIMD_IF_CONSTEXPR(mask.mask() ==0x1)
1106+
{
1107+
return _mm_load_ss(mem);
1108+
}
1109+
else XSIMD_IF_CONSTEXPR(mask.countr_one() == 2)
10751110
{
10761111
return _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const*>(mem));
10771112
}
@@ -1089,7 +1124,7 @@ namespace xsimd
10891124
{
10901125
XSIMD_IF_CONSTEXPR(mask.countr_one() ==1)
10911126
{
1092-
return _mm_move_sd(_mm_setzero_pd(), _mm_load_sd(mem));
1127+
return _mm_load_sd(mem);
10931128
}
10941129
else XSIMD_IF_CONSTEXPR(mask.countl_one() == 1)
10951130
{
@@ -1105,7 +1140,11 @@ namespace xsimd
11051140
template <class A, bool... Values, class Mode>
11061141
XSIMD_INLINE void store_masked(float* mem, batch<float, A> const& src, batch_bool_constant<float, A, Values...> mask, Mode, requires_arch<sse2>) noexcept
11071142
{
1108-
XSIMD_IF_CONSTEXPR(mask.countr_one() ==2)
1143+
XSIMD_IF_CONSTEXPR(mask.mask() ==0x1)
1144+
{
1145+
_mm_store_ss(mem, src);
1146+
}
1147+
else XSIMD_IF_CONSTEXPR(mask.countr_one() == 2)
11091148
{
11101149
_mm_storel_pi(reinterpret_cast<__m64*>(mem), src);
11111150
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp