Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 936546d

Browse files
committed
Optimize pg_comp_crc32c_sse42 routine slightly, and also use it on x86.
Eliminate the separate 'len' variable from the loops, and also use the 4 byte instruction. This shaves off a few more cycles. Even though this routine that uses the special SSE 4.2 instructions is much faster than a generic routine, it's still a hot spot, so let's make it as fast as possible. Change the configure test to not test _mm_crc32_u64. That variant is only available in the 64-bit x86-64 architecture, not in 32-bit x86. Modify pg_comp_crc32c_sse42 so that it only uses _mm_crc32_u64 on x86-64. With these changes, the SSE accelerated CRC-32C implementation can also be used on 32-bit x86 systems. This also fixes the 32-bit MSVC build.
1 parent b73e7a0 commit 936546d

File tree

3 files changed

+41
-22
lines changed

3 files changed

+41
-22
lines changed

‎config/c-compiler.m4

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -476,20 +476,24 @@ fi])# PGAC_HAVE_GCC__ATOMIC_INT64_CAS
476476

477477
# PGAC_SSE42_CRC32_INTRINSICS
478478
# -----------------------
479-
# Check if the compiler supports _mm_crc32_u8 and _mm_crc32_u64 intrinsics.
479+
# Check if the compiler supports the x86 CRC instructions added in SSE 4.2,
480+
# using the _mm_crc32_u8 and _mm_crc32_u32 intrinsic functions. (We don't
481+
# test the 8-byte variant, _mm_crc32_u64, but it is assumed to be present if
482+
# the other ones are, on x86-64 platforms)
483+
#
480484
# An optional compiler flag can be passed as argument (e.g. -msse4.2). If the
481485
# intrinsics are supported, sets pgac_sse42_crc32_intrinsics, and CFLAGS_SSE42.
482486
AC_DEFUN([PGAC_SSE42_CRC32_INTRINSICS],
483487
[define([Ac_cachevar],[AS_TR_SH([pgac_cv_sse42_crc32_intrinsics_$1])])dnl
484-
AC_CACHE_CHECK([for _mm_crc32_u8 and_mm_crc32_u64 with CFLAGS=$1],[Ac_cachevar],
488+
AC_CACHE_CHECK([for _mm_crc32_u8 and_mm_crc32_u32 with CFLAGS=$1],[Ac_cachevar],
485489
[pgac_save_CFLAGS=$CFLAGS
486490
CFLAGS="$pgac_save_CFLAGS$1"
487491
ac_save_c_werror_flag=$ac_c_werror_flag
488492
ac_c_werror_flag=yes
489493
AC_TRY_LINK([#include <nmmintrin.h>],
490494
[unsigned int crc = 0;
491495
crc = _mm_crc32_u8(crc, 0);
492-
crc =(unsigned int) _mm_crc32_u64(crc, 0);],
496+
crc =_mm_crc32_u32(crc, 0);],
493497
[Ac_cachevar=yes],
494498
[Ac_cachevar=no])
495499
ac_c_werror_flag=$ac_save_c_werror_flag

‎configure

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14172,8 +14172,8 @@ fi
1417214172
# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
1417314173
# with the default compiler flags. If not, check if adding the -msse4.2
1417414174
# flag helps. CFLAGS_SSE42 is set to -msse4.2 if that's required.
14175-
{$as_echo"$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and_mm_crc32_u64 with CFLAGS=">&5
14176-
$as_echo_n"checking for _mm_crc32_u8 and_mm_crc32_u64 with CFLAGS=...">&6; }
14175+
{$as_echo"$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and_mm_crc32_u32 with CFLAGS=">&5
14176+
$as_echo_n"checking for _mm_crc32_u8 and_mm_crc32_u32 with CFLAGS=...">&6; }
1417714177
if${pgac_cv_sse42_crc32_intrinsics_+:}false;then:
1417814178
$as_echo_n"(cached)">&6
1417914179
else
@@ -14189,7 +14189,7 @@ main ()
1418914189
{
1419014190
unsigned int crc = 0;
1419114191
crc = _mm_crc32_u8(crc, 0);
14192-
crc =(unsigned int) _mm_crc32_u64(crc, 0);
14192+
crc =_mm_crc32_u32(crc, 0);
1419314193
;
1419414194
return 0;
1419514195
}
@@ -14212,8 +14212,8 @@ if test x"$pgac_cv_sse42_crc32_intrinsics_" = x"yes"; then
1421214212
fi
1421314213

1421414214
iftest x"$pgac_sse42_crc32_intrinsics"!= x"yes";then
14215-
{$as_echo"$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and_mm_crc32_u64 with CFLAGS=-msse4.2">&5
14216-
$as_echo_n"checking for _mm_crc32_u8 and_mm_crc32_u64 with CFLAGS=-msse4.2...">&6; }
14215+
{$as_echo"$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and_mm_crc32_u32 with CFLAGS=-msse4.2">&5
14216+
$as_echo_n"checking for _mm_crc32_u8 and_mm_crc32_u32 with CFLAGS=-msse4.2...">&6; }
1421714217
if${pgac_cv_sse42_crc32_intrinsics__msse4_2+:}false;then:
1421814218
$as_echo_n"(cached)">&6
1421914219
else
@@ -14229,7 +14229,7 @@ main ()
1422914229
{
1423014230
unsigned int crc = 0;
1423114231
crc = _mm_crc32_u8(crc, 0);
14232-
crc =(unsigned int) _mm_crc32_u64(crc, 0);
14232+
crc =_mm_crc32_u32(crc, 0);
1423314233
;
1423414234
return 0;
1423514235
}

‎src/port/pg_crc32c_sse42.c

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -22,30 +22,45 @@ pg_crc32c
2222
pg_comp_crc32c_sse42(pg_crc32ccrc,constvoid*data,size_tlen)
2323
{
2424
constunsignedchar*p=data;
25-
constuint64*p8;
25+
constunsignedchar*pend=p+len;
2626

2727
/*
2828
* Process eight bytes of data at a time.
2929
*
30-
* NB: We do unaligned8-byteaccesses here. The Intel architecture
31-
*allows that,and performance testing didn't show any performance
32-
*gain from aligningthebeginning address.
30+
* NB: We do unaligned accesses here. The Intel architecture allows that,
31+
* and performance testing didn't show any performance gain from aligning
32+
* thebegin address.
3333
*/
34-
p8= (constuint64*)p;
35-
while (len >=8)
34+
#ifdef__x86_64__
35+
while (p+8 <=pend)
3636
{
37-
crc= (uint32)_mm_crc32_u64(crc,*p8++);
38-
len-=8;
37+
crc= (uint32)_mm_crc32_u64(crc,*((constuint64*)p));
38+
p+=8;
3939
}
4040

41+
/* Process remaining full four bytes if any */
42+
if (p+4 <=pend)
43+
{
44+
crc=_mm_crc32_u32(crc,*((constunsignedint*)p));
45+
p+=4;
46+
}
47+
#else
4148
/*
42-
* Handle any remaining bytes one at a time.
49+
* Process four bytes at a time. (The eight byte instruction is not
50+
* available on the 32-bit x86 architecture).
4351
*/
44-
p= (constunsignedchar*)p8;
45-
while (len>0)
52+
while (p+4 <=pend)
53+
{
54+
crc=_mm_crc32_u32(crc,*((constunsignedint*)p));
55+
p+=4;
56+
}
57+
#endif/* __x86_64__ */
58+
59+
/* Process any remaining bytes one at a time. */
60+
while (p<pend)
4661
{
47-
crc=_mm_crc32_u8(crc,*p++);
48-
len--;
62+
crc=_mm_crc32_u8(crc,*p);
63+
p++;
4964
}
5065

5166
returncrc;

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp