Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit41c51f0

Browse files
Optimize visibilitymap_count() with AVX-512 instructions.
Commit792752a added infrastructure for using AVX-512 intrinsicfunctions, and this commit uses that infrastructure to optimizevisibilitymap_count(). Specificially, a new pg_popcount_masked()function is introduced that applies a bitmask to every byte in thebuffer prior to calculating the population count, which is used tofilter out the all-visible or all-frozen bits as needed. Platformswithout AVX-512 support should also see a nice speedup due to thereduced number of calls to a function pointer.Co-authored-by: Ants AasmaDiscussion:https://postgr.es/m/BL1PR11MB5304097DF7EA81D04C33F3D1DCA6A%40BL1PR11MB5304.namprd11.prod.outlook.com
1 parent792752a commit41c51f0

File tree

4 files changed

+225
-20
lines changed

4 files changed

+225
-20
lines changed

‎src/backend/access/heap/visibilitymap.c

Lines changed: 5 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -119,10 +119,8 @@
119119
#defineHEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
120120

121121
/* Masks for counting subsets of bits in the visibility map. */
122-
#defineVISIBLE_MASK64UINT64CONST(0x5555555555555555)/* The lower bit of each
123-
* bit pair */
124-
#defineFROZEN_MASK64UINT64CONST(0xaaaaaaaaaaaaaaaa)/* The upper bit of each
125-
* bit pair */
122+
#defineVISIBLE_MASK8(0x55)/* The lower bit of each bit pair */
123+
#defineFROZEN_MASK8(0xaa)/* The upper bit of each bit pair */
126124

127125
/* prototypes for internal routines */
128126
staticBuffervm_readbuf(Relationrel,BlockNumberblkno,boolextend);
@@ -396,7 +394,6 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro
396394
{
397395
BuffermapBuffer;
398396
uint64*map;
399-
inti;
400397

401398
/*
402399
* Read till we fall off the end of the map. We assume that any extra
@@ -414,21 +411,9 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro
414411
*/
415412
map= (uint64*)PageGetContents(BufferGetPage(mapBuffer));
416413

417-
StaticAssertStmt(MAPSIZE %sizeof(uint64)==0,
418-
"unsupported MAPSIZE");
419-
if (all_frozen==NULL)
420-
{
421-
for (i=0;i<MAPSIZE /sizeof(uint64);i++)
422-
nvisible+=pg_popcount64(map[i]&VISIBLE_MASK64);
423-
}
424-
else
425-
{
426-
for (i=0;i<MAPSIZE /sizeof(uint64);i++)
427-
{
428-
nvisible+=pg_popcount64(map[i]&VISIBLE_MASK64);
429-
nfrozen+=pg_popcount64(map[i]&FROZEN_MASK64);
430-
}
431-
}
414+
nvisible+=pg_popcount_masked((constchar*)map,MAPSIZE,VISIBLE_MASK8);
415+
if (all_frozen)
416+
nfrozen+=pg_popcount_masked((constchar*)map,MAPSIZE,FROZEN_MASK8);
432417

433418
ReleaseBuffer(mapBuffer);
434419
}

‎src/include/port/pg_bitutils.h

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,7 @@ pg_ceil_log2_64(uint64 num)
303303
externPGDLLIMPORTint (*pg_popcount32) (uint32word);
304304
externPGDLLIMPORTint (*pg_popcount64) (uint64word);
305305
externPGDLLIMPORTuint64 (*pg_popcount_optimized) (constchar*buf,intbytes);
306+
externPGDLLIMPORTuint64 (*pg_popcount_masked_optimized) (constchar*buf,intbytes,bits8mask);
306307

307308
/*
308309
* We can also try to use the AVX-512 popcount instruction on some systems.
@@ -313,13 +314,15 @@ extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes);
313314
#ifdefUSE_AVX512_POPCNT_WITH_RUNTIME_CHECK
314315
externboolpg_popcount_avx512_available(void);
315316
externuint64pg_popcount_avx512(constchar*buf,intbytes);
317+
externuint64pg_popcount_masked_avx512(constchar*buf,intbytes,bits8mask);
316318
#endif
317319

318320
#else
319321
/* Use a portable implementation -- no need for a function pointer. */
320322
externintpg_popcount32(uint32word);
321323
externintpg_popcount64(uint64word);
322324
externuint64pg_popcount_optimized(constchar*buf,intbytes);
325+
externuint64pg_popcount_masked_optimized(constchar*buf,intbytes,bits8mask);
323326

324327
#endif/* TRY_POPCNT_FAST */
325328

@@ -357,6 +360,37 @@ pg_popcount(const char *buf, int bytes)
357360
returnpg_popcount_optimized(buf,bytes);
358361
}
359362

363+
/*
364+
* Returns the number of 1-bits in buf after applying the mask to each byte.
365+
*
366+
* Similar to pg_popcount(), we only take on the function pointer overhead when
367+
* it's likely to be faster.
368+
*/
369+
staticinlineuint64
370+
pg_popcount_masked(constchar*buf,intbytes,bits8mask)
371+
{
372+
/*
373+
* We set the threshold to the point at which we'll first use special
374+
* instructions in the optimized version.
375+
*/
376+
#ifSIZEOF_VOID_P >=8
377+
intthreshold=8;
378+
#else
379+
intthreshold=4;
380+
#endif
381+
382+
if (bytes<threshold)
383+
{
384+
uint64popcnt=0;
385+
386+
while (bytes--)
387+
popcnt+=pg_number_of_ones[(unsignedchar)*buf++&mask];
388+
returnpopcnt;
389+
}
390+
391+
returnpg_popcount_masked_optimized(buf,bytes,mask);
392+
}
393+
360394
/*
361395
* Rotate the bits of "word" to the right/left by n bits.
362396
*/

‎src/port/pg_bitutils.c

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,19 +106,23 @@ const uint8 pg_number_of_ones[256] = {
106106
staticinlineintpg_popcount32_slow(uint32word);
107107
staticinlineintpg_popcount64_slow(uint64word);
108108
staticuint64pg_popcount_slow(constchar*buf,intbytes);
109+
staticuint64pg_popcount_masked_slow(constchar*buf,intbytes,bits8mask);
109110

110111
#ifdefTRY_POPCNT_FAST
111112
staticboolpg_popcount_available(void);
112113
staticintpg_popcount32_choose(uint32word);
113114
staticintpg_popcount64_choose(uint64word);
114115
staticuint64pg_popcount_choose(constchar*buf,intbytes);
116+
staticuint64pg_popcount_masked_choose(constchar*buf,intbytes,bits8mask);
115117
staticinlineintpg_popcount32_fast(uint32word);
116118
staticinlineintpg_popcount64_fast(uint64word);
117119
staticuint64pg_popcount_fast(constchar*buf,intbytes);
120+
staticuint64pg_popcount_masked_fast(constchar*buf,intbytes,bits8mask);
118121

119122
int(*pg_popcount32) (uint32word)=pg_popcount32_choose;
120123
int(*pg_popcount64) (uint64word)=pg_popcount64_choose;
121124
uint64(*pg_popcount_optimized) (constchar*buf,intbytes)=pg_popcount_choose;
125+
uint64(*pg_popcount_masked_optimized) (constchar*buf,intbytes,bits8mask)=pg_popcount_masked_choose;
122126
#endif/* TRY_POPCNT_FAST */
123127

124128
#ifdefTRY_POPCNT_FAST
@@ -156,17 +160,22 @@ choose_popcount_functions(void)
156160
pg_popcount32=pg_popcount32_fast;
157161
pg_popcount64=pg_popcount64_fast;
158162
pg_popcount_optimized=pg_popcount_fast;
163+
pg_popcount_masked_optimized=pg_popcount_masked_fast;
159164
}
160165
else
161166
{
162167
pg_popcount32=pg_popcount32_slow;
163168
pg_popcount64=pg_popcount64_slow;
164169
pg_popcount_optimized=pg_popcount_slow;
170+
pg_popcount_masked_optimized=pg_popcount_masked_slow;
165171
}
166172

167173
#ifdefUSE_AVX512_POPCNT_WITH_RUNTIME_CHECK
168174
if (pg_popcount_avx512_available())
175+
{
169176
pg_popcount_optimized=pg_popcount_avx512;
177+
pg_popcount_masked_optimized=pg_popcount_masked_avx512;
178+
}
170179
#endif
171180
}
172181

@@ -191,6 +200,13 @@ pg_popcount_choose(const char *buf, int bytes)
191200
returnpg_popcount_optimized(buf,bytes);
192201
}
193202

203+
staticuint64
204+
pg_popcount_masked_choose(constchar*buf,intbytes,bits8mask)
205+
{
206+
choose_popcount_functions();
207+
returnpg_popcount_masked(buf,bytes,mask);
208+
}
209+
194210
/*
195211
* pg_popcount32_fast
196212
*Return the number of 1 bits set in word
@@ -271,6 +287,56 @@ pg_popcount_fast(const char *buf, int bytes)
271287
returnpopcnt;
272288
}
273289

290+
/*
291+
* pg_popcount_masked_fast
292+
*Returns the number of 1-bits in buf after applying the mask to each byte
293+
*/
294+
staticuint64
295+
pg_popcount_masked_fast(constchar*buf,intbytes,bits8mask)
296+
{
297+
uint64popcnt=0;
298+
299+
#ifSIZEOF_VOID_P >=8
300+
/* Process in 64-bit chunks if the buffer is aligned */
301+
uint64maskv= ~UINT64CONST(0) /0xFF*mask;
302+
303+
if (buf== (constchar*)TYPEALIGN(8,buf))
304+
{
305+
constuint64*words= (constuint64*)buf;
306+
307+
while (bytes >=8)
308+
{
309+
popcnt+=pg_popcount64_fast(*words++&maskv);
310+
bytes-=8;
311+
}
312+
313+
buf= (constchar*)words;
314+
}
315+
#else
316+
/* Process in 32-bit chunks if the buffer is aligned. */
317+
uint32maskv= ~((uint32)0) /0xFF*mask;
318+
319+
if (buf== (constchar*)TYPEALIGN(4,buf))
320+
{
321+
constuint32*words= (constuint32*)buf;
322+
323+
while (bytes >=4)
324+
{
325+
popcnt+=pg_popcount32_fast(*words++&maskv);
326+
bytes-=4;
327+
}
328+
329+
buf= (constchar*)words;
330+
}
331+
#endif
332+
333+
/* Process any remaining bytes */
334+
while (bytes--)
335+
popcnt+=pg_number_of_ones[(unsignedchar)*buf++&mask];
336+
337+
returnpopcnt;
338+
}
339+
274340
#endif/* TRY_POPCNT_FAST */
275341

276342

@@ -370,6 +436,56 @@ pg_popcount_slow(const char *buf, int bytes)
370436
returnpopcnt;
371437
}
372438

439+
/*
440+
* pg_popcount_masked_slow
441+
*Returns the number of 1-bits in buf after applying the mask to each byte
442+
*/
443+
staticuint64
444+
pg_popcount_masked_slow(constchar*buf,intbytes,bits8mask)
445+
{
446+
uint64popcnt=0;
447+
448+
#ifSIZEOF_VOID_P >=8
449+
/* Process in 64-bit chunks if the buffer is aligned */
450+
uint64maskv= ~UINT64CONST(0) /0xFF*mask;
451+
452+
if (buf== (constchar*)TYPEALIGN(8,buf))
453+
{
454+
constuint64*words= (constuint64*)buf;
455+
456+
while (bytes >=8)
457+
{
458+
popcnt+=pg_popcount64_slow(*words++&maskv);
459+
bytes-=8;
460+
}
461+
462+
buf= (constchar*)words;
463+
}
464+
#else
465+
/* Process in 32-bit chunks if the buffer is aligned. */
466+
uint32maskv= ~((uint32)0) /0xFF*mask;
467+
468+
if (buf== (constchar*)TYPEALIGN(4,buf))
469+
{
470+
constuint32*words= (constuint32*)buf;
471+
472+
while (bytes >=4)
473+
{
474+
popcnt+=pg_popcount32_slow(*words++&maskv);
475+
bytes-=4;
476+
}
477+
478+
buf= (constchar*)words;
479+
}
480+
#endif
481+
482+
/* Process any remaining bytes */
483+
while (bytes--)
484+
popcnt+=pg_number_of_ones[(unsignedchar)*buf++&mask];
485+
486+
returnpopcnt;
487+
}
488+
373489
#ifndefTRY_POPCNT_FAST
374490

375491
/*
@@ -401,4 +517,14 @@ pg_popcount_optimized(const char *buf, int bytes)
401517
returnpg_popcount_slow(buf,bytes);
402518
}
403519

520+
/*
521+
* pg_popcount_masked_optimized
522+
*Returns the number of 1-bits in buf after applying the mask to each byte
523+
*/
524+
uint64
525+
pg_popcount_masked_optimized(constchar*buf,intbytes,bits8mask)
526+
{
527+
returnpg_popcount_masked_slow(buf,bytes,mask);
528+
}
529+
404530
#endif/* !TRY_POPCNT_FAST */

‎src/port/pg_popcount_avx512.c

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,4 +78,64 @@ pg_popcount_avx512(const char *buf, int bytes)
7878
return_mm512_reduce_add_epi64(accum);
7979
}
8080

81+
/*
82+
* pg_popcount_masked_avx512
83+
*Returns the number of 1-bits in buf after applying the mask to each byte
84+
*/
85+
uint64
86+
pg_popcount_masked_avx512(constchar*buf,intbytes,bits8mask)
87+
{
88+
__m512ival,
89+
vmasked,
90+
cnt;
91+
__m512iaccum=_mm512_setzero_si512();
92+
constchar*final;
93+
inttail_idx;
94+
__mmask64bmask= ~UINT64CONST(0);
95+
const__m512imaskv=_mm512_set1_epi8(mask);
96+
97+
/*
98+
* Align buffer down to avoid double load overhead from unaligned access.
99+
* Calculate a mask to ignore preceding bytes. Find start offset of final
100+
* iteration and ensure it is not empty.
101+
*/
102+
bmask <<= ((uintptr_t)buf) %sizeof(__m512i);
103+
tail_idx= (((uintptr_t)buf+bytes-1) %sizeof(__m512i))+1;
104+
final= (constchar*)TYPEALIGN_DOWN(sizeof(__m512i),buf+bytes-1);
105+
buf= (constchar*)TYPEALIGN_DOWN(sizeof(__m512i),buf);
106+
107+
/*
108+
* Iterate through all but the final iteration. Starting from the second
109+
* iteration, the mask is ignored.
110+
*/
111+
if (buf<final)
112+
{
113+
val=_mm512_maskz_loadu_epi8(bmask, (const__m512i*)buf);
114+
vmasked=_mm512_and_si512(val,maskv);
115+
cnt=_mm512_popcnt_epi64(vmasked);
116+
accum=_mm512_add_epi64(accum,cnt);
117+
118+
buf+=sizeof(__m512i);
119+
bmask= ~UINT64CONST(0);
120+
121+
for (;buf<final;buf+=sizeof(__m512i))
122+
{
123+
val=_mm512_load_si512((const__m512i*)buf);
124+
vmasked=_mm512_and_si512(val,maskv);
125+
cnt=_mm512_popcnt_epi64(vmasked);
126+
accum=_mm512_add_epi64(accum,cnt);
127+
}
128+
}
129+
130+
/* Final iteration needs to ignore bytes that are not within the length */
131+
bmask &= (~UINT64CONST(0) >> (sizeof(__m512i)-tail_idx));
132+
133+
val=_mm512_maskz_loadu_epi8(bmask, (const__m512i*)buf);
134+
vmasked=_mm512_and_si512(val,maskv);
135+
cnt=_mm512_popcnt_epi64(vmasked);
136+
accum=_mm512_add_epi64(accum,cnt);
137+
138+
return_mm512_reduce_add_epi64(accum);
139+
}
140+
81141
#endif/* TRY_POPCNT_FAST */

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp