@@ -1068,10 +1068,45 @@ namespace xsimd
10681068 }
10691069
10701070// load_masked
1071+ template <class A ,class T ,bool ... Values,class Mode ,class =typename std::enable_if<std::is_integral<T>::value>::type>
1072+ XSIMD_INLINE batch<T, A>load_masked (Tconst * mem, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<sse2>)noexcept
1073+ {
1074+ XSIMD_IF_CONSTEXPR (mask.mask () ==0x1 )
1075+ {
1076+ XSIMD_IF_CONSTEXPR (sizeof (T) ==2 )
1077+ {
1078+ return mm_loadu_si16 (mem);
1079+ }
1080+ XSIMD_IF_CONSTEXPR (sizeof (T) ==4 )
1081+ {
1082+ return mm_loadu_si32 (mem);
1083+ }
1084+ XSIMD_IF_CONSTEXPR (sizeof (T) ==8 )
1085+ {
1086+ return mm_loadu_si64 (mem);
1087+ }
1088+ }
1089+ else XSIMD_IF_CONSTEXPR (sizeof (T) ==2 && mask.mask () ==0x3 )
1090+ {
1091+ return mm_loadu_si32 (mem);
1092+ }
1093+ else XSIMD_IF_CONSTEXPR (sizeof (T) ==4 && mask.mask () ==0x3 )
1094+ {
1095+ return mm_loadu_si64 (mem);
1096+ }
1097+ else
1098+ {
1099+ return load_masked<A>(mem, mask, convert<T> {}, Mode {}, common {});
1100+ }
1101+ }
10711102template <class A ,bool ... Values,class Mode >
10721103 XSIMD_INLINE batch<float , A>load_masked (float const * mem, batch_bool_constant<float , A, Values...> mask, Mode, requires_arch<sse2>)noexcept
10731104 {
1074- XSIMD_IF_CONSTEXPR (mask.countr_one () ==2 )
1105+ XSIMD_IF_CONSTEXPR (mask.mask () ==0x1 )
1106+ {
1107+ return _mm_load_ss (mem);
1108+ }
1109+ else XSIMD_IF_CONSTEXPR (mask.countr_one () ==2 )
10751110 {
10761111return _mm_loadl_pi (_mm_setzero_ps (),reinterpret_cast <__m64const *>(mem));
10771112 }
@@ -1089,7 +1124,7 @@ namespace xsimd
10891124 {
10901125XSIMD_IF_CONSTEXPR (mask.countr_one () ==1 )
10911126 {
1092- return _mm_move_sd ( _mm_setzero_pd (), _mm_load_sd (mem) );
1127+ return _mm_load_sd (mem);
10931128 }
10941129else XSIMD_IF_CONSTEXPR (mask.countl_one () ==1 )
10951130 {
@@ -1105,7 +1140,11 @@ namespace xsimd
11051140template <class A ,bool ... Values,class Mode >
11061141 XSIMD_INLINEvoid store_masked (float * mem, batch<float , A>const & src, batch_bool_constant<float , A, Values...> mask, Mode, requires_arch<sse2>)noexcept
11071142 {
1108- XSIMD_IF_CONSTEXPR (mask.countr_one () ==2 )
1143+ XSIMD_IF_CONSTEXPR (mask.mask () ==0x1 )
1144+ {
1145+ _mm_store_ss (mem, src);
1146+ }
1147+ else XSIMD_IF_CONSTEXPR (mask.countr_one () ==2 )
11091148 {
11101149_mm_storel_pi (reinterpret_cast <__m64*>(mem), src);
11111150 }