NotificationsYou must be signed in to change notification settings
Fork5.2k
Star19k

Commit6be53c2

committed

Optimize popcount functions with ARM Neon intrinsics.

This commit introduces Neon implementations of pg_popcount{32,64},pg_popcount(), and pg_popcount_masked(). As in simd.h, we assumethat all available AArch64 hardware supports Neon, so we don't needany new configure-time or runtime checks. Some compilers alreadyemit Neon instructions for these functions, but our hand-rolledimplementations for pg_popcount() and pg_popcount_masked()performed better in testing, likely due to better instruction-levelparallelism.Author: "Chiranmoy.Bhattacharya@fujitsu.com" <Chiranmoy.Bhattacharya@fujitsu.com>Reviewed-by: John Naylor <johncnaylorls@gmail.com>Discussion:https://postgr.es/m/010101936e4aaa70-b474ab9e-b9ce-474d-a3ba-a3dc223d295c-000000%40us-west-2.amazonses.com

1 parent51a0382 commit6be53c2Copy full SHA for 6be53c2

File tree

5 files changed

+235

-6

lines changed

src
- include/port
  - pg_bitutils.h
- port

5 files changed

+235

-6

lines changed

`‎src/include/port/pg_bitutils.h‎`

Lines changed: 9 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -298,6 +298,15 @@ pg_ceil_log2_64(uint64 num)`
`298`	`298`	`#endif`
`299`	`299`	`#endif`
`300`	`300`
	`301`	`+/*`
	`302`	`+ * On AArch64, we can use Neon instructions if the compiler provides access to`
	`303`	`+ * them (as indicated by __ARM_NEON). As in simd.h, we assume that all`
	`304`	`+ * available 64-bit hardware has Neon support.`
	`305`	`+ */`
	`306`	`+#if defined(__aarch64__)&& defined(__ARM_NEON)`
	`307`	`+#definePOPCNT_AARCH64 1`
	`308`	`+#endif`
	`309`	`+`
`301`	`310`	`#ifdefTRY_POPCNT_X86_64`
`302`	`311`	`/* Attempt to use the POPCNT instruction, but perform a runtime check first */`
`303`	`312`	`externPGDLLIMPORTint (*pg_popcount32) (uint32word);`

`‎src/port/Makefile‎`

Lines changed: 1 addition & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -46,6 +46,7 @@ OBJS = \`
`46`	`46`	`path.o\`
`47`	`47`	`pg_bitutils.o\`
`48`	`48`	`pg_localeconv_r.o\`
	`49`	`+pg_popcount_aarch64.o\`
`49`	`50`	`pg_popcount_avx512.o\`
`50`	`51`	`pg_strong_random.o\`
`51`	`52`	`pgcheckdir.o\`

`‎src/port/meson.build‎`

Lines changed: 1 addition & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -9,6 +9,7 @@ pgport_sources = [`
`9`	`9`	`'path.c',`
`10`	`10`	`'pg_bitutils.c',`
`11`	`11`	`'pg_localeconv_r.c',`
	`12`	`+'pg_popcount_aarch64.c',`
`12`	`13`	`'pg_popcount_avx512.c',`
`13`	`14`	`'pg_strong_random.c',`
`14`	`15`	`'pgcheckdir.c',`

`‎src/port/pg_bitutils.c‎`

Lines changed: 16 additions & 6 deletions

Original file line number	Diff line number	Diff line change
`@@ -103,10 +103,15 @@ const uint8 pg_number_of_ones[256] = {`
`103`	`103`	`4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8`
`104`	`104`	`};`
`105`	`105`
	`106`	`+/*`
	`107`	`+ * If we are building the Neon versions, we don't need the "slow" fallbacks.`
	`108`	`+ */`
	`109`	`+#ifndefPOPCNT_AARCH64`
`106`	`110`	`staticinlineintpg_popcount32_slow(uint32word);`
`107`	`111`	`staticinlineintpg_popcount64_slow(uint64word);`
`108`	`112`	`staticuint64pg_popcount_slow(constchar*buf,intbytes);`
`109`	`113`	`staticuint64pg_popcount_masked_slow(constchar*buf,intbytes,bits8mask);`
	`114`	`+#endif`
`110`	`115`
`111`	`116`	`#ifdefTRY_POPCNT_X86_64`
`112`	`117`	`staticboolpg_popcount_available(void);`
`@@ -339,6 +344,10 @@ pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask)`
`339`	`344`
`340`	`345`	`#endif/* TRY_POPCNT_X86_64 */`
`341`	`346`
	`347`	`+/*`
	`348`	`+ * If we are building the Neon versions, we don't need the "slow" fallbacks.`
	`349`	`+ */`
	`350`	`+#ifndefPOPCNT_AARCH64`
`342`	`351`
`343`	`352`	`/*`
`344`	`353`	`* pg_popcount32_slow`
`@@ -486,14 +495,15 @@ pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask)`
`486`	`495`	`returnpopcnt;`
`487`	`496`	`}`
`488`	`497`
`489`		`-#ifndefTRY_POPCNT_X86_64`
	`498`	`+#endif/* ! POPCNT_AARCH64 */`
	`499`	`+`
	`500`	`+#if !defined(TRY_POPCNT_X86_64)&& !defined(POPCNT_AARCH64)`
`490`	`501`
`491`	`502`	`/*`
`492`		`- * Whenthe POPCNT instruction is not available, there's no point in using`
	`503`	`+ * Whenspecial CPU instructions are not available, there's no point in using`
`493`	`504`	`* function pointers to vary the implementation between the fast and slow`
`494`		`- * method. We instead just make these actual external functions when`
`495`		`- * TRY_POPCNT_X86_64 is not defined. The compiler should be able to inline`
`496`		`- * the slow versions here.`
	`505`	`+ * method. We instead just make these actual external functions. The compiler`
	`506`	`+ * should be able to inline the slow versions here.`
`497`	`507`	`*/`
`498`	`508`	`int`
`499`	`509`	`pg_popcount32(uint32word)`
`@@ -527,4 +537,4 @@ pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)`
`527`	`537`	`returnpg_popcount_masked_slow(buf,bytes,mask);`
`528`	`538`	`}`
`529`	`539`
`530`		`-#endif/* !TRY_POPCNT_X86_64 */`
	`540`	`+#endif/* !TRY_POPCNT_X86_64 && ! POPCNT_AARCH64 */`

`‎src/port/pg_popcount_aarch64.c‎`

Lines changed: 208 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,208 @@`
	`1`	`+/*-------------------------------------------------------------------------`
	`2`	`+ *`
	`3`	`+ * pg_popcount_aarch64.c`
	`4`	`+ * Holds the AArch64 popcount implementations.`
	`5`	`+ *`
	`6`	`+ * Copyright (c) 2025, PostgreSQL Global Development Group`
	`7`	`+ *`
	`8`	`+ * IDENTIFICATION`
	`9`	`+ * src/port/pg_popcount_aarch64.c`
	`10`	`+ *`
	`11`	`+ *-------------------------------------------------------------------------`
	`12`	`+ */`
	`13`	`+#include"c.h"`
	`14`	`+`
	`15`	`+#include"port/pg_bitutils.h"`
	`16`	`+`
	`17`	`+#ifdefPOPCNT_AARCH64`
	`18`	`+`
	`19`	`+#include<arm_neon.h>`
	`20`	`+`
	`21`	`+/*`
	`22`	`+ * pg_popcount32`
	`23`	`+ *Return number of 1 bits in word`
	`24`	`+ */`
	`25`	`+int`
	`26`	`+pg_popcount32(uint32word)`
	`27`	`+{`
	`28`	`+returnpg_popcount64((uint64)word);`
	`29`	`+}`
	`30`	`+`
	`31`	`+/*`
	`32`	`+ * pg_popcount64`
	`33`	`+ *Return number of 1 bits in word`
	`34`	`+ */`
	`35`	`+int`
	`36`	`+pg_popcount64(uint64word)`
	`37`	`+{`
	`38`	`+/*`
	`39`	`+ * For some compilers, __builtin_popcountl() already emits Neon`
	`40`	`+ * instructions. The line below should compile to the same code on those`
	`41`	`+ * systems.`
	`42`	`+ */`
	`43`	`+returnvaddv_u8(vcnt_u8(vld1_u8((constuint8*)&word)));`
	`44`	`+}`
	`45`	`+`
	`46`	`+/*`
	`47`	`+ * pg_popcount_optimized`
	`48`	`+ *Returns number of 1 bits in buf`
	`49`	`+ */`
	`50`	`+uint64`
	`51`	`+pg_popcount_optimized(constchar*buf,intbytes)`
	`52`	`+{`
	`53`	`+uint8x16_tvec;`
	`54`	`+uint64x2_taccum1=vdupq_n_u64(0),`
	`55`	`+accum2=vdupq_n_u64(0),`
	`56`	`+accum3=vdupq_n_u64(0),`
	`57`	`+accum4=vdupq_n_u64(0);`
	`58`	`+uint32bytes_per_iteration=4*sizeof(uint8x16_t);`
	`59`	`+uint64popcnt=0;`
	`60`	`+`
	`61`	`+/*`
	`62`	`+ * For better instruction-level parallelism, each loop iteration operates`
	`63`	`+ * on a block of four registers.`
	`64`	`+ */`
	`65`	`+for (;bytes >=bytes_per_iteration;bytes-=bytes_per_iteration)`
	`66`	`+{`
	`67`	`+vec=vld1q_u8((constuint8*)buf);`
	`68`	`+accum1=vpadalq_u32(accum1,vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));`
	`69`	`+buf+=sizeof(uint8x16_t);`
	`70`	`+`
	`71`	`+vec=vld1q_u8((constuint8*)buf);`
	`72`	`+accum2=vpadalq_u32(accum2,vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));`
	`73`	`+buf+=sizeof(uint8x16_t);`
	`74`	`+`
	`75`	`+vec=vld1q_u8((constuint8*)buf);`
	`76`	`+accum3=vpadalq_u32(accum3,vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));`
	`77`	`+buf+=sizeof(uint8x16_t);`
	`78`	`+`
	`79`	`+vec=vld1q_u8((constuint8*)buf);`
	`80`	`+accum4=vpadalq_u32(accum4,vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));`
	`81`	`+buf+=sizeof(uint8x16_t);`
	`82`	`+}`
	`83`	`+`
	`84`	`+/*`
	`85`	`+ * If enough data remains, do another iteration on a block of two`
	`86`	`+ * registers.`
	`87`	`+ */`
	`88`	`+bytes_per_iteration=2*sizeof(uint8x16_t);`
	`89`	`+if (bytes >=bytes_per_iteration)`
	`90`	`+{`
	`91`	`+vec=vld1q_u8((constuint8*)buf);`
	`92`	`+accum1=vpadalq_u32(accum1,vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));`
	`93`	`+buf+=sizeof(uint8x16_t);`
	`94`	`+`
	`95`	`+vec=vld1q_u8((constuint8*)buf);`
	`96`	`+accum2=vpadalq_u32(accum2,vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));`
	`97`	`+buf+=sizeof(uint8x16_t);`
	`98`	`+`
	`99`	`+bytes-=bytes_per_iteration;`
	`100`	`+}`
	`101`	`+`
	`102`	`+/*`
	`103`	`+ * Add the accumulators.`
	`104`	`+ */`
	`105`	`+popcnt+=vaddvq_u64(vaddq_u64(accum1,accum2));`
	`106`	`+popcnt+=vaddvq_u64(vaddq_u64(accum3,accum4));`
	`107`	`+`
	`108`	`+/*`
	`109`	`+ * Process remaining 8-byte blocks.`
	`110`	`+ */`
	`111`	`+for (;bytes >=sizeof(uint64);bytes-=sizeof(uint64))`
	`112`	`+{`
	`113`	`+popcnt+=pg_popcount64(((uint64)buf));`
	`114`	`+buf+=sizeof(uint64);`
	`115`	`+}`
	`116`	`+`
	`117`	`+/*`
	`118`	`+ * Process any remaining data byte-by-byte.`
	`119`	`+ */`
	`120`	`+while (bytes--)`
	`121`	`+popcnt+=pg_number_of_ones[(unsignedchar)*buf++];`
	`122`	`+`
	`123`	`+returnpopcnt;`
	`124`	`+}`
	`125`	`+`
	`126`	`+/*`
	`127`	`+ * pg_popcount_masked_optimized`
	`128`	`+ *Returns number of 1 bits in buf after applying the mask to each byte`
	`129`	`+ */`
	`130`	`+uint64`
	`131`	`+pg_popcount_masked_optimized(constchar*buf,intbytes,bits8mask)`
	`132`	`+{`
	`133`	`+uint8x16_tvec,`
	`134`	`+maskv=vdupq_n_u8(mask);`
	`135`	`+uint64x2_taccum1=vdupq_n_u64(0),`
	`136`	`+accum2=vdupq_n_u64(0),`
	`137`	`+accum3=vdupq_n_u64(0),`
	`138`	`+accum4=vdupq_n_u64(0);`
	`139`	`+uint32bytes_per_iteration=4*sizeof(uint8x16_t);`
	`140`	`+uint64popcnt=0,`
	`141`	`+mask64= ~UINT64CONST(0) /0xFF*mask;`
	`142`	`+`
	`143`	`+/*`
	`144`	`+ * For better instruction-level parallelism, each loop iteration operates`
	`145`	`+ * on a block of four registers.`
	`146`	`+ */`
	`147`	`+for (;bytes >=bytes_per_iteration;bytes-=bytes_per_iteration)`
	`148`	`+{`
	`149`	`+vec=vandq_u8(vld1q_u8((constuint8*)buf),maskv);`
	`150`	`+accum1=vpadalq_u32(accum1,vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));`
	`151`	`+buf+=sizeof(uint8x16_t);`
	`152`	`+`
	`153`	`+vec=vandq_u8(vld1q_u8((constuint8*)buf),maskv);`
	`154`	`+accum2=vpadalq_u32(accum2,vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));`
	`155`	`+buf+=sizeof(uint8x16_t);`
	`156`	`+`
	`157`	`+vec=vandq_u8(vld1q_u8((constuint8*)buf),maskv);`
	`158`	`+accum3=vpadalq_u32(accum3,vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));`
	`159`	`+buf+=sizeof(uint8x16_t);`
	`160`	`+`
	`161`	`+vec=vandq_u8(vld1q_u8((constuint8*)buf),maskv);`
	`162`	`+accum4=vpadalq_u32(accum4,vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));`
	`163`	`+buf+=sizeof(uint8x16_t);`
	`164`	`+}`
	`165`	`+`
	`166`	`+/*`
	`167`	`+ * If enough data remains, do another iteration on a block of two`
	`168`	`+ * registers.`
	`169`	`+ */`
	`170`	`+bytes_per_iteration=2*sizeof(uint8x16_t);`
	`171`	`+if (bytes >=bytes_per_iteration)`
	`172`	`+{`
	`173`	`+vec=vandq_u8(vld1q_u8((constuint8*)buf),maskv);`
	`174`	`+accum1=vpadalq_u32(accum1,vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));`
	`175`	`+buf+=sizeof(uint8x16_t);`
	`176`	`+`
	`177`	`+vec=vandq_u8(vld1q_u8((constuint8*)buf),maskv);`
	`178`	`+accum2=vpadalq_u32(accum2,vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));`
	`179`	`+buf+=sizeof(uint8x16_t);`
	`180`	`+`
	`181`	`+bytes-=bytes_per_iteration;`
	`182`	`+}`
	`183`	`+`
	`184`	`+/*`
	`185`	`+ * Add the accumulators.`
	`186`	`+ */`
	`187`	`+popcnt+=vaddvq_u64(vaddq_u64(accum1,accum2));`
	`188`	`+popcnt+=vaddvq_u64(vaddq_u64(accum3,accum4));`
	`189`	`+`
	`190`	`+/*`
	`191`	`+ * Process remining 8-byte blocks.`
	`192`	`+ */`
	`193`	`+for (;bytes >=sizeof(uint64);bytes-=sizeof(uint64))`
	`194`	`+{`
	`195`	`+popcnt+=pg_popcount64(((uint64)buf)&mask64);`
	`196`	`+buf+=sizeof(uint64);`
	`197`	`+}`
	`198`	`+`
	`199`	`+/*`
	`200`	`+ * Process any remaining data byte-by-byte.`
	`201`	`+ */`
	`202`	`+while (bytes--)`
	`203`	`+popcnt+=pg_number_of_ones[(unsignedchar)*buf++&mask];`
	`204`	`+`
	`205`	`+returnpopcnt;`
	`206`	`+}`
	`207`	`+`
	`208`	`+#endif/* POPCNT_AARCH64 */`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit6be53c2

File tree

5 files changed

5 files changed

`‎src/include/port/pg_bitutils.h‎`

`‎src/port/Makefile‎`

`‎src/port/meson.build‎`

`‎src/port/pg_bitutils.c‎`

`‎src/port/pg_popcount_aarch64.c‎`

0 commit comments