Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 121d2d3

Browse files
committed
Use SSE2 in is_valid_ascii() where available.
Per flame graph from Jelte Fennema, COPY FROM ... USING BINARY shows input validation taking at least 5% of the profile, so it's worth trying to be more efficient here. With this change, validation of pure ASCII is nearly 40% faster on contemporary Intel hardware. To make this change legible and easier to adopt to additional architectures, use helper functions to abstract the platform details away. Reviewed by Nathan Bossart. Discussion: https://www.postgresql.org/message-id/CAFBsxsG%3Dk8t%3DC457FXnoBXb%3D8iA4OaZkbFogFMachWif7mNnww%40mail.gmail.com
1 parent ab97178 commit 121d2d3

File tree

3 files changed

+86
-14
lines changed

3 files changed

+86
-14
lines changed

‎src/common/wchar.c

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1919,10 +1919,11 @@ pg_utf8_verifystr(const unsigned char *s, int len)
19191919
uint32state=BGN;
19201920

19211921
/*
1922-
* Sixteen seems to give the best balance of performance across different
1923-
* byte distributions.
1922+
* With a stride of two vector widths, gcc will unroll the loop. Even if
1923+
* the compiler can unroll a longer loop, it's not worth it because we
1924+
* must fall back to the byte-wise algorithm if we find any non-ASCII.
19241925
*/
1925-
#defineSTRIDE_LENGTH16
1926+
#defineSTRIDE_LENGTH(2 * sizeof(Vector8))
19261927

19271928
if (len >=STRIDE_LENGTH)
19281929
{

‎src/include/mb/pg_wchar.h

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
#ifndefPG_WCHAR_H
2020
#definePG_WCHAR_H
2121

22+
#include"port/simd.h"
23+
2224
/*
2325
* The pg_wchar type
2426
*/
@@ -704,25 +706,28 @@ extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len);
704706
* Verify a chunk of bytes for valid ASCII.
705707
*
706708
* Returns false if the input contains any zero bytes or bytes with the
707-
* high-bit set. Input len must be a multiple of 8.
709+
* high-bit set. Input len must be a multiple of the chunk size (8 or 16).
708710
*/
709711
staticinlinebool
710712
is_valid_ascii(constunsignedchar*s,intlen)
711713
{
712714
constunsignedchar*consts_end=s+len;
713-
uint64chunk,
714-
highbit_cum=UINT64CONST(0),
715-
zero_cum=UINT64CONST(0x8080808080808080);
715+
Vector8chunk;
716+
Vector8highbit_cum=vector8_broadcast(0);
717+
#ifdefUSE_NO_SIMD
718+
Vector8zero_cum=vector8_broadcast(0x80);
719+
#endif
716720

717721
Assert(len %sizeof(chunk)==0);
718722

719723
while (s<s_end)
720724
{
721-
memcpy(&chunk,s,sizeof(chunk));
725+
vector8_load(&chunk,s);
726+
727+
/* Capture any zero bytes in this chunk. */
728+
#ifdefUSE_NO_SIMD
722729

723730
/*
724-
* Capture any zero bytes in this chunk.
725-
*
726731
* First, add 0x7f to each byte. This sets the high bit in each byte,
727732
* unless it was a zero. If any resulting high bits are zero, the
728733
* corresponding high bits in the zero accumulator will be cleared.
@@ -733,21 +738,32 @@ is_valid_ascii(const unsigned char *s, int len)
733738
* any input bytes did have the high bit set, it doesn't matter
734739
* because we check for those separately.
735740
*/
736-
zero_cum &= (chunk+UINT64CONST(0x7f7f7f7f7f7f7f7f));
741+
zero_cum &= (chunk+vector8_broadcast(0x7F));
742+
#else
743+
744+
/*
745+
* Set all bits in each lane of the highbit accumulator where input
746+
* bytes are zero.
747+
*/
748+
highbit_cum=vector8_or(highbit_cum,
749+
vector8_eq(chunk,vector8_broadcast(0)));
750+
#endif
737751

738752
/* Capture all set bits in this chunk. */
739-
highbit_cum|=chunk;
753+
highbit_cum=vector8_or(highbit_cum,chunk);
740754

741755
s+=sizeof(chunk);
742756
}
743757

744758
/* Check if any high bits in the high bit accumulator got set. */
745-
if (highbit_cum&UINT64CONST(0x8080808080808080))
759+
if (vector8_is_highbit_set(highbit_cum))
746760
return false;
747761

762+
#ifdefUSE_NO_SIMD
748763
/* Check if any high bits in the zero accumulator got cleared. */
749-
if (zero_cum!=UINT64CONST(0x8080808080808080))
764+
if (zero_cum!=vector8_broadcast(0x80))
750765
return false;
766+
#endif
751767

752768
return true;
753769
}

‎src/include/port/simd.h

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,18 @@ static inline Vector8 vector8_broadcast(const uint8 c);
5252
staticinlineboolvector8_has(constVector8v,constuint8c);
5353
staticinlineboolvector8_has_zero(constVector8v);
5454
staticinlineboolvector8_has_le(constVector8v,constuint8c);
55+
staticinlineboolvector8_is_highbit_set(constVector8v);
5556

57+
/* arithmetic operations */
58+
staticinlineVector8vector8_or(constVector8v1,constVector8v2);
59+
60+
/* Different semantics for SIMD architectures. */
61+
#ifndefUSE_NO_SIMD
62+
63+
/* comparisons between vectors */
64+
staticinlineVector8vector8_eq(constVector8v1,constVector8v2);
65+
66+
#endif/* ! USE_NO_SIMD */
5667

5768
/*
5869
* Load a chunk of memory into the given vector.
@@ -193,4 +204,48 @@ vector8_has_le(const Vector8 v, const uint8 c)
193204
returnresult;
194205
}
195206

207+
/*
208+
* Return true if the high bit of any element is set
209+
*/
210+
staticinlinebool
211+
vector8_is_highbit_set(constVector8v)
212+
{
213+
#ifdefUSE_SSE2
214+
return_mm_movemask_epi8(v)!=0;
215+
#else
216+
returnv&vector8_broadcast(0x80);
217+
#endif
218+
}
219+
220+
/*
221+
* Return the bitwise OR of the inputs
222+
*/
223+
staticinlineVector8
224+
vector8_or(constVector8v1,constVector8v2)
225+
{
226+
#ifdefUSE_SSE2
227+
return_mm_or_si128(v1,v2);
228+
#else
229+
returnv1 |v2;
230+
#endif
231+
}
232+
233+
234+
/* Different semantics for SIMD architectures. */
235+
#ifndefUSE_NO_SIMD
236+
237+
/*
238+
* Return a vector with all bits set in each lane where the corresponding
239+
* lanes in the inputs are equal.
240+
*/
241+
staticinlineVector8
242+
vector8_eq(constVector8v1,constVector8v2)
243+
{
244+
#ifdefUSE_SSE2
245+
return_mm_cmpeq_epi8(v1,v2);
246+
#endif
247+
}
248+
249+
#endif/* ! USE_NO_SIMD */
250+
196251
#endif/* SIMD_H */

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp