Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit6be53c2

Browse files
Optimize popcount functions with ARM Neon intrinsics.
This commit introduces Neon implementations of pg_popcount{32,64},pg_popcount(), and pg_popcount_masked(). As in simd.h, we assumethat all available AArch64 hardware supports Neon, so we don't needany new configure-time or runtime checks. Some compilers alreadyemit Neon instructions for these functions, but our hand-rolledimplementations for pg_popcount() and pg_popcount_masked()performed better in testing, likely due to better instruction-levelparallelism.Author: "Chiranmoy.Bhattacharya@fujitsu.com" <Chiranmoy.Bhattacharya@fujitsu.com>Reviewed-by: John Naylor <johncnaylorls@gmail.com>Discussion:https://postgr.es/m/010101936e4aaa70-b474ab9e-b9ce-474d-a3ba-a3dc223d295c-000000%40us-west-2.amazonses.com
1 parent51a0382 commit6be53c2

File tree

5 files changed

+235
-6
lines changed

5 files changed

+235
-6
lines changed

‎src/include/port/pg_bitutils.h‎

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,15 @@ pg_ceil_log2_64(uint64 num)
298298
#endif
299299
#endif
300300

301+
/*
302+
* On AArch64, we can use Neon instructions if the compiler provides access to
303+
* them (as indicated by __ARM_NEON). As in simd.h, we assume that all
304+
* available 64-bit hardware has Neon support.
305+
*/
306+
#if defined(__aarch64__)&& defined(__ARM_NEON)
307+
#definePOPCNT_AARCH64 1
308+
#endif
309+
301310
#ifdefTRY_POPCNT_X86_64
302311
/* Attempt to use the POPCNT instruction, but perform a runtime check first */
303312
externPGDLLIMPORTint (*pg_popcount32) (uint32word);

‎src/port/Makefile‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ OBJS = \
4646
path.o\
4747
pg_bitutils.o\
4848
pg_localeconv_r.o\
49+
pg_popcount_aarch64.o\
4950
pg_popcount_avx512.o\
5051
pg_strong_random.o\
5152
pgcheckdir.o\

‎src/port/meson.build‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ pgport_sources = [
99
'path.c',
1010
'pg_bitutils.c',
1111
'pg_localeconv_r.c',
12+
'pg_popcount_aarch64.c',
1213
'pg_popcount_avx512.c',
1314
'pg_strong_random.c',
1415
'pgcheckdir.c',

‎src/port/pg_bitutils.c‎

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -103,10 +103,15 @@ const uint8 pg_number_of_ones[256] = {
103103
4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8
104104
};
105105

106+
/*
107+
* If we are building the Neon versions, we don't need the "slow" fallbacks.
108+
*/
109+
#ifndefPOPCNT_AARCH64
106110
staticinlineintpg_popcount32_slow(uint32word);
107111
staticinlineintpg_popcount64_slow(uint64word);
108112
staticuint64pg_popcount_slow(constchar*buf,intbytes);
109113
staticuint64pg_popcount_masked_slow(constchar*buf,intbytes,bits8mask);
114+
#endif
110115

111116
#ifdefTRY_POPCNT_X86_64
112117
staticboolpg_popcount_available(void);
@@ -339,6 +344,10 @@ pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask)
339344

340345
#endif/* TRY_POPCNT_X86_64 */
341346

347+
/*
348+
* If we are building the Neon versions, we don't need the "slow" fallbacks.
349+
*/
350+
#ifndefPOPCNT_AARCH64
342351

343352
/*
344353
* pg_popcount32_slow
@@ -486,14 +495,15 @@ pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask)
486495
returnpopcnt;
487496
}
488497

489-
#ifndefTRY_POPCNT_X86_64
498+
#endif/* ! POPCNT_AARCH64 */
499+
500+
#if !defined(TRY_POPCNT_X86_64)&& !defined(POPCNT_AARCH64)
490501

491502
/*
492-
* Whenthe POPCNT instruction is not available, there's no point in using
503+
* Whenspecial CPU instructions are not available, there's no point in using
493504
* function pointers to vary the implementation between the fast and slow
494-
* method. We instead just make these actual external functions when
495-
* TRY_POPCNT_X86_64 is not defined. The compiler should be able to inline
496-
* the slow versions here.
505+
* method. We instead just make these actual external functions. The compiler
506+
* should be able to inline the slow versions here.
497507
*/
498508
int
499509
pg_popcount32(uint32word)
@@ -527,4 +537,4 @@ pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
527537
returnpg_popcount_masked_slow(buf,bytes,mask);
528538
}
529539

530-
#endif/* !TRY_POPCNT_X86_64 */
540+
#endif/* !TRY_POPCNT_X86_64 && ! POPCNT_AARCH64 */

‎src/port/pg_popcount_aarch64.c‎

Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
/*-------------------------------------------------------------------------
2+
*
3+
* pg_popcount_aarch64.c
4+
* Holds the AArch64 popcount implementations.
5+
*
6+
* Copyright (c) 2025, PostgreSQL Global Development Group
7+
*
8+
* IDENTIFICATION
9+
* src/port/pg_popcount_aarch64.c
10+
*
11+
*-------------------------------------------------------------------------
12+
*/
13+
#include"c.h"
14+
15+
#include"port/pg_bitutils.h"
16+
17+
#ifdefPOPCNT_AARCH64
18+
19+
#include<arm_neon.h>
20+
21+
/*
22+
* pg_popcount32
23+
*Return number of 1 bits in word
24+
*/
25+
int
26+
pg_popcount32(uint32word)
27+
{
28+
returnpg_popcount64((uint64)word);
29+
}
30+
31+
/*
32+
* pg_popcount64
33+
*Return number of 1 bits in word
34+
*/
35+
int
36+
pg_popcount64(uint64word)
37+
{
38+
/*
39+
* For some compilers, __builtin_popcountl() already emits Neon
40+
* instructions. The line below should compile to the same code on those
41+
* systems.
42+
*/
43+
returnvaddv_u8(vcnt_u8(vld1_u8((constuint8*)&word)));
44+
}
45+
46+
/*
47+
* pg_popcount_optimized
48+
*Returns number of 1 bits in buf
49+
*/
50+
uint64
51+
pg_popcount_optimized(constchar*buf,intbytes)
52+
{
53+
uint8x16_tvec;
54+
uint64x2_taccum1=vdupq_n_u64(0),
55+
accum2=vdupq_n_u64(0),
56+
accum3=vdupq_n_u64(0),
57+
accum4=vdupq_n_u64(0);
58+
uint32bytes_per_iteration=4*sizeof(uint8x16_t);
59+
uint64popcnt=0;
60+
61+
/*
62+
* For better instruction-level parallelism, each loop iteration operates
63+
* on a block of four registers.
64+
*/
65+
for (;bytes >=bytes_per_iteration;bytes-=bytes_per_iteration)
66+
{
67+
vec=vld1q_u8((constuint8*)buf);
68+
accum1=vpadalq_u32(accum1,vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
69+
buf+=sizeof(uint8x16_t);
70+
71+
vec=vld1q_u8((constuint8*)buf);
72+
accum2=vpadalq_u32(accum2,vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
73+
buf+=sizeof(uint8x16_t);
74+
75+
vec=vld1q_u8((constuint8*)buf);
76+
accum3=vpadalq_u32(accum3,vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
77+
buf+=sizeof(uint8x16_t);
78+
79+
vec=vld1q_u8((constuint8*)buf);
80+
accum4=vpadalq_u32(accum4,vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
81+
buf+=sizeof(uint8x16_t);
82+
}
83+
84+
/*
85+
* If enough data remains, do another iteration on a block of two
86+
* registers.
87+
*/
88+
bytes_per_iteration=2*sizeof(uint8x16_t);
89+
if (bytes >=bytes_per_iteration)
90+
{
91+
vec=vld1q_u8((constuint8*)buf);
92+
accum1=vpadalq_u32(accum1,vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
93+
buf+=sizeof(uint8x16_t);
94+
95+
vec=vld1q_u8((constuint8*)buf);
96+
accum2=vpadalq_u32(accum2,vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
97+
buf+=sizeof(uint8x16_t);
98+
99+
bytes-=bytes_per_iteration;
100+
}
101+
102+
/*
103+
* Add the accumulators.
104+
*/
105+
popcnt+=vaddvq_u64(vaddq_u64(accum1,accum2));
106+
popcnt+=vaddvq_u64(vaddq_u64(accum3,accum4));
107+
108+
/*
109+
* Process remaining 8-byte blocks.
110+
*/
111+
for (;bytes >=sizeof(uint64);bytes-=sizeof(uint64))
112+
{
113+
popcnt+=pg_popcount64(*((uint64*)buf));
114+
buf+=sizeof(uint64);
115+
}
116+
117+
/*
118+
* Process any remaining data byte-by-byte.
119+
*/
120+
while (bytes--)
121+
popcnt+=pg_number_of_ones[(unsignedchar)*buf++];
122+
123+
returnpopcnt;
124+
}
125+
126+
/*
127+
* pg_popcount_masked_optimized
128+
*Returns number of 1 bits in buf after applying the mask to each byte
129+
*/
130+
uint64
131+
pg_popcount_masked_optimized(constchar*buf,intbytes,bits8mask)
132+
{
133+
uint8x16_tvec,
134+
maskv=vdupq_n_u8(mask);
135+
uint64x2_taccum1=vdupq_n_u64(0),
136+
accum2=vdupq_n_u64(0),
137+
accum3=vdupq_n_u64(0),
138+
accum4=vdupq_n_u64(0);
139+
uint32bytes_per_iteration=4*sizeof(uint8x16_t);
140+
uint64popcnt=0,
141+
mask64= ~UINT64CONST(0) /0xFF*mask;
142+
143+
/*
144+
* For better instruction-level parallelism, each loop iteration operates
145+
* on a block of four registers.
146+
*/
147+
for (;bytes >=bytes_per_iteration;bytes-=bytes_per_iteration)
148+
{
149+
vec=vandq_u8(vld1q_u8((constuint8*)buf),maskv);
150+
accum1=vpadalq_u32(accum1,vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
151+
buf+=sizeof(uint8x16_t);
152+
153+
vec=vandq_u8(vld1q_u8((constuint8*)buf),maskv);
154+
accum2=vpadalq_u32(accum2,vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
155+
buf+=sizeof(uint8x16_t);
156+
157+
vec=vandq_u8(vld1q_u8((constuint8*)buf),maskv);
158+
accum3=vpadalq_u32(accum3,vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
159+
buf+=sizeof(uint8x16_t);
160+
161+
vec=vandq_u8(vld1q_u8((constuint8*)buf),maskv);
162+
accum4=vpadalq_u32(accum4,vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
163+
buf+=sizeof(uint8x16_t);
164+
}
165+
166+
/*
167+
* If enough data remains, do another iteration on a block of two
168+
* registers.
169+
*/
170+
bytes_per_iteration=2*sizeof(uint8x16_t);
171+
if (bytes >=bytes_per_iteration)
172+
{
173+
vec=vandq_u8(vld1q_u8((constuint8*)buf),maskv);
174+
accum1=vpadalq_u32(accum1,vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
175+
buf+=sizeof(uint8x16_t);
176+
177+
vec=vandq_u8(vld1q_u8((constuint8*)buf),maskv);
178+
accum2=vpadalq_u32(accum2,vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
179+
buf+=sizeof(uint8x16_t);
180+
181+
bytes-=bytes_per_iteration;
182+
}
183+
184+
/*
185+
* Add the accumulators.
186+
*/
187+
popcnt+=vaddvq_u64(vaddq_u64(accum1,accum2));
188+
popcnt+=vaddvq_u64(vaddq_u64(accum3,accum4));
189+
190+
/*
191+
* Process remining 8-byte blocks.
192+
*/
193+
for (;bytes >=sizeof(uint64);bytes-=sizeof(uint64))
194+
{
195+
popcnt+=pg_popcount64(*((uint64*)buf)&mask64);
196+
buf+=sizeof(uint64);
197+
}
198+
199+
/*
200+
* Process any remaining data byte-by-byte.
201+
*/
202+
while (bytes--)
203+
popcnt+=pg_number_of_ones[(unsignedchar)*buf++&mask];
204+
205+
returnpopcnt;
206+
}
207+
208+
#endif/* POPCNT_AARCH64 */

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp