Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 519338a

Browse files
nathan-bossartchiranmoyfRamaKishanMalladi
committed
Optimize popcount functions with ARM SVE intrinsics.
This commit introduces SVE implementations of pg_popcount{32,64}. Unlike the Neon versions, we need an additional configure-time check to determine if the compiler supports SVE intrinsics, and we need a runtime check to determine if the current CPU supports SVE instructions. Our testing showed that the SVE implementations are much faster for larger inputs and are comparable to the status quo for smaller inputs.

Author: "Devanga.Susmitha@fujitsu.com" <Devanga.Susmitha@fujitsu.com>
Co-authored-by: "Chiranmoy.Bhattacharya@fujitsu.com" <Chiranmoy.Bhattacharya@fujitsu.com>
Co-authored-by: "Malladi, Rama" <ramamalladi@hotmail.com>
Reviewed-by: John Naylor <johncnaylorls@gmail.com>
Reviewed-by: Kirill Reshke <reshkekirill@gmail.com>
Discussion: https://postgr.es/m/010101936e4aaa70-b474ab9e-b9ce-474d-a3ba-a3dc223d295c-000000%40us-west-2.amazonses.com
Discussion: https://postgr.es/m/OSZPR01MB84990A9A02A3515C6E85A65B8B2A2%40OSZPR01MB8499.jpnprd01.prod.outlook.com
1 parent 3c8e463, commit 519338a

File tree

7 files changed

+475
-6
lines changed

7 files changed

+475
-6
lines changed

‎config/c-compiler.m4‎

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -708,3 +708,55 @@ if test x"$Ac_cachevar" = x"yes"; then
708708
fi
709709
undefine([Ac_cachevar])dnl
710710
])# PGAC_AVX512_POPCNT_INTRINSICS
711+
712+
# PGAC_SVE_POPCNT_INTRINSICS
713+
# --------------------------
714+
# Check if the compiler supports the SVE popcount instructions using the
715+
# svptrue_b64, svdup_u64, svcntb, svld1_u64, svld1_u8, svadd_u64_x,
716+
# svcnt_u64_x, svcnt_u8_x, svaddv_u64, svaddv_u8, svwhilelt_b8_s32,
717+
# svand_n_u64_x, and svand_n_u8_x intrinsic functions.
718+
#
719+
# If the intrinsics are supported, sets pgac_sve_popcnt_intrinsics.
720+
AC_DEFUN([PGAC_SVE_POPCNT_INTRINSICS],
721+
[define([Ac_cachevar],[AS_TR_SH([pgac_cv_sve_popcnt_intrinsics])])dnl
722+
AC_CACHE_CHECK([for svcnt_x],[Ac_cachevar],
723+
[AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <arm_sve.h>
724+
725+
char buf[128];
726+
727+
#if defined(__has_attribute) && __has_attribute (target)
728+
__attribute__((target("arch=armv8-a+sve")))
729+
#endif
730+
static int popcount_test(void)
731+
{
732+
svbool_t pred = svptrue_b64();
733+
svuint8_t vec8;
734+
svuint64_t accum1 = svdup_u64(0),
735+
accum2 = svdup_u64(0),
736+
vec64;
737+
char *p = buf;
738+
uint64_t popcnt,
739+
mask = 0x5555555555555555;
740+
741+
vec64 = svand_n_u64_x(pred, svld1_u64(pred, (const uint64_t *) p), mask);
742+
accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec64));
743+
p += svcntb();
744+
745+
vec64 = svand_n_u64_x(pred, svld1_u64(pred, (const uint64_t *) p), mask);
746+
accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec64));
747+
p += svcntb();
748+
749+
popcnt = svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));
750+
751+
pred = svwhilelt_b8_s32(0, sizeof(buf));
752+
vec8 = svand_n_u8_x(pred, svld1_u8(pred, (const uint8_t *) p), 0x55);
753+
return (int) (popcnt + svaddv_u8(pred, svcnt_u8_x(pred, vec8)));
754+
}]],
755+
[return popcount_test();])],
756+
[Ac_cachevar=yes],
757+
[Ac_cachevar=no])])
758+
if test x"$Ac_cachevar" = x"yes"; then
759+
pgac_sve_popcnt_intrinsics=yes
760+
fi
761+
undefine([Ac_cachevar])dnl
762+
])# PGAC_SVE_POPCNT_INTRINSICS

‎configure‎

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17517,6 +17517,77 @@ $as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h
1751717517
fi
1751817518
fi
1751917519

17520+
# Check for SVE popcount intrinsics
17521+
#
17522+
if test x"$host_cpu" = x"aarch64"; then
17523+
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for svcnt_x" >&5
17524+
$as_echo_n "checking for svcnt_x... " >&6; }
17525+
if ${pgac_cv_sve_popcnt_intrinsics+:} false; then :
17526+
$as_echo_n "(cached) " >&6
17527+
else
17528+
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
17529+
/* end confdefs.h. */
17530+
#include <arm_sve.h>
17531+
17532+
char buf[128];
17533+
17534+
#if defined(__has_attribute) && __has_attribute (target)
17535+
__attribute__((target("arch=armv8-a+sve")))
17536+
#endif
17537+
static int popcount_test(void)
17538+
{
17539+
svbool_t pred = svptrue_b64();
17540+
svuint8_t vec8;
17541+
svuint64_t accum1 = svdup_u64(0),
17542+
accum2 = svdup_u64(0),
17543+
vec64;
17544+
char *p = buf;
17545+
uint64_t popcnt,
17546+
mask = 0x5555555555555555;
17547+
17548+
vec64 = svand_n_u64_x(pred, svld1_u64(pred, (const uint64_t *) p), mask);
17549+
accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec64));
17550+
p += svcntb();
17551+
17552+
vec64 = svand_n_u64_x(pred, svld1_u64(pred, (const uint64_t *) p), mask);
17553+
accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec64));
17554+
p += svcntb();
17555+
17556+
popcnt = svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));
17557+
17558+
pred = svwhilelt_b8_s32(0, sizeof(buf));
17559+
vec8 = svand_n_u8_x(pred, svld1_u8(pred, (const uint8_t *) p), 0x55);
17560+
return (int) (popcnt + svaddv_u8(pred, svcnt_u8_x(pred, vec8)));
17561+
}
17562+
int
17563+
main ()
17564+
{
17565+
return popcount_test();
17566+
;
17567+
return 0;
17568+
}
17569+
_ACEOF
17570+
if ac_fn_c_try_link "$LINENO"; then :
17571+
pgac_cv_sve_popcnt_intrinsics=yes
17572+
else
17573+
pgac_cv_sve_popcnt_intrinsics=no
17574+
fi
17575+
rm -f core conftest.err conftest.$ac_objext \
17576+
conftest$ac_exeext conftest.$ac_ext
17577+
fi
17578+
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sve_popcnt_intrinsics" >&5
17579+
$as_echo "$pgac_cv_sve_popcnt_intrinsics" >&6; }
17580+
if test x"$pgac_cv_sve_popcnt_intrinsics" = x"yes"; then
17581+
pgac_sve_popcnt_intrinsics=yes
17582+
fi
17583+
17584+
if test x"$pgac_sve_popcnt_intrinsics" = x"yes"; then
17585+
17586+
$as_echo "#define USE_SVE_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h
17587+
17588+
fi
17589+
fi
17590+
1752017591
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
1752117592
#
1752217593
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32" >&5

‎configure.ac‎

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2070,6 +2070,15 @@ if test x"$host_cpu" = x"x86_64"; then
20702070
fi
20712071
fi
20722072

2073+
# Check for SVE popcount intrinsics
2074+
#
2075+
if test x"$host_cpu" = x"aarch64"; then
2076+
PGAC_SVE_POPCNT_INTRINSICS()
2077+
if test x"$pgac_sve_popcnt_intrinsics" = x"yes"; then
2078+
AC_DEFINE(USE_SVE_POPCNT_WITH_RUNTIME_CHECK,1,[Define to 1 to use SVE popcount instructions with a runtime check.])
2079+
fi
2080+
fi
2081+
20732082
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
20742083
#
20752084
PGAC_SSE42_CRC32_INTRINSICS()

‎meson.build‎

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2297,6 +2297,54 @@ int main(void)
22972297
endif
22982298

22992299

2300+
###############################################################
2301+
# Check for the availability of SVE popcount intrinsics.
2302+
###############################################################
2303+
2304+
if host_cpu=='aarch64'
2305+
2306+
prog='''
2307+
#include <arm_sve.h>
2308+
2309+
char buf[128];
2310+
2311+
#if defined(__has_attribute) && __has_attribute (target)
2312+
__attribute__((target("arch=armv8-a+sve")))
2313+
#endif
2314+
int main(void)
2315+
{
2316+
svbool_t pred = svptrue_b64();
2317+
svuint8_t vec8;
2318+
svuint64_t accum1 = svdup_u64(0),
2319+
accum2 = svdup_u64(0),
2320+
vec64;
2321+
char *p = buf;
2322+
uint64_t popcnt,
2323+
mask = 0x5555555555555555;
2324+
2325+
vec64 = svand_n_u64_x(pred, svld1_u64(pred, (const uint64_t *) p), mask);
2326+
accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec64));
2327+
p += svcntb();
2328+
2329+
vec64 = svand_n_u64_x(pred, svld1_u64(pred, (const uint64_t *) p), mask);
2330+
accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec64));
2331+
p += svcntb();
2332+
2333+
popcnt = svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));
2334+
2335+
pred = svwhilelt_b8_s32(0, sizeof(buf));
2336+
vec8 = svand_n_u8_x(pred, svld1_u8(pred, (const uint8_t *) p), 0x55);
2337+
return (int) (popcnt + svaddv_u8(pred, svcnt_u8_x(pred, vec8)));
2338+
}
2339+
'''
2340+
2341+
if cc.links(prog,name:'SVE popcount',args: test_c_args)
2342+
cdata.set('USE_SVE_POPCNT_WITH_RUNTIME_CHECK',1)
2343+
endif
2344+
2345+
endif
2346+
2347+
23002348
###############################################################
23012349
# Select CRC-32C implementation.
23022350
#

‎src/include/pg_config.h.in‎

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -712,6 +712,9 @@
712712
/* Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check. */
713713
#undef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
714714

715+
/* Define to 1 to use SVE popcount instructions with a runtime check. */
716+
#undef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
717+
715718
/* Define to build with systemd support. (--with-systemd) */
716719
#undef USE_SYSTEMD
717720

‎src/include/port/pg_bitutils.h‎

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,23 @@ extern uint64 pg_popcount_avx512(const char *buf, int bytes);
324324
extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask);
325325
#endif
326326

327+
#elif POPCNT_AARCH64
328+
/* Use the Neon version of pg_popcount{32,64} without function pointer. */
329+
extern int pg_popcount32(uint32 word);
330+
extern int pg_popcount64(uint64 word);
331+
332+
/*
333+
* We can try to use an SVE-optimized pg_popcount() on some systems.  For that,
334+
* we do use a function pointer.
335+
*/
336+
#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
337+
extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes);
338+
extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask);
339+
#else
340+
extern uint64 pg_popcount_optimized(const char *buf, int bytes);
341+
extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask);
342+
#endif
343+
327344
#else
328345
/* Use a portable implementation -- no need for a function pointer. */
329346
extern int pg_popcount32(uint32 word);

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp