Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 56788d2

Browse files
committed
Allocate consecutive blocks during parallel seqscans
Previously we would allocate blocks to parallel workers during a parallel sequential scan 1 block at a time. Since other workers were likely to request a block before a worker returns for another block number to work on, this could lead to non-sequential I/O patterns in each worker which could cause the operating system's readahead to perform poorly or not at all.

Here we change things so that we allocate consecutive "chunks" of blocks to workers and have them work on those until they're done, at which time we allocate another chunk for the worker. The size of these chunks is based on the size of the relation.

Initial patch here was by Thomas Munro which showed some good improvements just having a fixed chunk size of 64 blocks with a simple ramp-down near the end of the scan. The revisions of the patch to make the chunk size based on the relation size and the adjusted ramp-down in powers of two was done by me, along with quite extensive benchmarking to determine the optimal chunk sizes.

For the most part, benchmarks have shown significant performance improvements for large parallel sequential scans on Linux, FreeBSD and Windows using SSDs. It's less clear how this affects the performance of cloud providers. Tests done so far are unable to obtain stable enough performance to provide meaningful benchmark results. It is possible that this could cause some performance regressions on more obscure filesystems, so we may need to later provide users with some ability to get something closer to the old behavior. For now, let's leave that until we see that it's really required.

Author: Thomas Munro, David Rowley
Reviewed-by: Ranier Vilela, Soumyadeep Chakraborty, Robert Haas
Reviewed-by: Amit Kapila, Kirk Jamison
Discussion: https://postgr.es/m/CA+hUKGJ_EErDv41YycXcbMbCBkztA34+z1ts9VQH+ACRuvpxig@mail.gmail.com
1 parent 11a68e4 · commit 56788d2

File tree

4 files changed

+144
-12
lines changed

4 files changed

+144
-12
lines changed

‎src/backend/access/heap/heapam.c

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -520,12 +520,14 @@ heapgettup(HeapScanDesc scan,
520520
{
521521
ParallelBlockTableScanDescpbscan=
522522
(ParallelBlockTableScanDesc)scan->rs_base.rs_parallel;
523+
ParallelBlockTableScanWorkerpbscanwork=
524+
(ParallelBlockTableScanWorker)scan->rs_base.rs_private;
523525

524526
table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
525-
pbscan);
527+
pbscanwork,pbscan);
526528

527529
page=table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
528-
pbscan);
530+
pbscanwork,pbscan);
529531

530532
/* Other processes might have already finished the scan. */
531533
if (page==InvalidBlockNumber)
@@ -720,9 +722,11 @@ heapgettup(HeapScanDesc scan,
720722
{
721723
ParallelBlockTableScanDescpbscan=
722724
(ParallelBlockTableScanDesc)scan->rs_base.rs_parallel;
725+
ParallelBlockTableScanWorkerpbscanwork=
726+
(ParallelBlockTableScanWorker)scan->rs_base.rs_private;
723727

724728
page=table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
725-
pbscan);
729+
pbscanwork,pbscan);
726730
finished= (page==InvalidBlockNumber);
727731
}
728732
else
@@ -834,12 +838,14 @@ heapgettup_pagemode(HeapScanDesc scan,
834838
{
835839
ParallelBlockTableScanDescpbscan=
836840
(ParallelBlockTableScanDesc)scan->rs_base.rs_parallel;
841+
ParallelBlockTableScanWorkerpbscanwork=
842+
(ParallelBlockTableScanWorker)scan->rs_base.rs_private;
837843

838844
table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
839-
pbscan);
845+
pbscanwork,pbscan);
840846

841847
page=table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
842-
pbscan);
848+
pbscanwork,pbscan);
843849

844850
/* Other processes might have already finished the scan. */
845851
if (page==InvalidBlockNumber)
@@ -1019,9 +1025,11 @@ heapgettup_pagemode(HeapScanDesc scan,
10191025
{
10201026
ParallelBlockTableScanDescpbscan=
10211027
(ParallelBlockTableScanDesc)scan->rs_base.rs_parallel;
1028+
ParallelBlockTableScanWorkerpbscanwork=
1029+
(ParallelBlockTableScanWorker)scan->rs_base.rs_private;
10221030

10231031
page=table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
1024-
pbscan);
1032+
pbscanwork,pbscan);
10251033
finished= (page==InvalidBlockNumber);
10261034
}
10271035
else
@@ -1155,6 +1163,8 @@ heap_beginscan(Relation relation, Snapshot snapshot,
11551163
scan->rs_base.rs_nkeys=nkeys;
11561164
scan->rs_base.rs_flags=flags;
11571165
scan->rs_base.rs_parallel=parallel_scan;
1166+
scan->rs_base.rs_private=
1167+
palloc(sizeof(ParallelBlockTableScanWorkerData));
11581168
scan->rs_strategy=NULL;/* set in initscan */
11591169

11601170
/*

‎src/backend/access/table/tableam.c

Lines changed: 113 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,24 @@
2525
#include"access/tableam.h"
2626
#include"access/xact.h"
2727
#include"optimizer/plancat.h"
28+
#include"port/pg_bitutils.h"
2829
#include"storage/bufmgr.h"
2930
#include"storage/shmem.h"
3031
#include"storage/smgr.h"
3132

33+
/*
34+
* Constants to control the behavior of block allocation to parallel workers
35+
* during a parallel seqscan. Technically these values do not need to be
36+
* powers of 2, but having them as powers of 2 makes the math more optimal
37+
* and makes the ramp-down stepping more even.
38+
*/
39+
40+
/* The number of I/O chunks we try to break a parallel seqscan down into */
41+
#definePARALLEL_SEQSCAN_NCHUNKS2048
42+
/* Ramp down size of allocations when we've only this number of chunks left */
43+
#definePARALLEL_SEQSCAN_RAMPDOWN_CHUNKS64
44+
/* Cap the size of parallel I/O chunks to this number of blocks */
45+
#definePARALLEL_SEQSCAN_MAX_CHUNK_SIZE8192
3246

3347
/* GUC variables */
3448
char*default_table_access_method=DEFAULT_TABLE_ACCESS_METHOD;
@@ -408,10 +422,37 @@ table_block_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
408422
* to set the startblock once.
409423
*/
410424
void
411-
table_block_parallelscan_startblock_init(Relationrel,ParallelBlockTableScanDescpbscan)
425+
table_block_parallelscan_startblock_init(Relationrel,
426+
ParallelBlockTableScanWorkerpbscanwork,
427+
ParallelBlockTableScanDescpbscan)
412428
{
413429
BlockNumbersync_startpage=InvalidBlockNumber;
414430

431+
/* Reset the state we use for controlling allocation size. */
432+
memset(pbscanwork,0,sizeof(*pbscanwork));
433+
434+
StaticAssertStmt(MaxBlockNumber <=0xFFFFFFFE,
435+
"pg_nextpower2_32 may be too small for non-standard BlockNumber width");
436+
437+
/*
438+
* We determine the chunk size based on the size of the relation. First we
439+
* split the relation into PARALLEL_SEQSCAN_NCHUNKS chunks but we then
440+
* take the next highest power of 2 number of the chunk size. This means
441+
* we split the relation into somewhere between PARALLEL_SEQSCAN_NCHUNKS
442+
* and PARALLEL_SEQSCAN_NCHUNKS / 2 chunks.
443+
*/
444+
pbscanwork->phsw_chunk_size=pg_nextpower2_32(Max(pbscan->phs_nblocks /
445+
PARALLEL_SEQSCAN_NCHUNKS,1));
446+
447+
/*
448+
* Ensure we don't go over the maximum chunk size with larger tables. This
449+
* means we may get much more than PARALLEL_SEQSCAN_NCHUNKS for larger
450+
* tables. Too large a chunk size has been shown to be detrimental to
451+
* synchronous scan performance.
452+
*/
453+
pbscanwork->phsw_chunk_size=Min(pbscanwork->phsw_chunk_size,
454+
PARALLEL_SEQSCAN_MAX_CHUNK_SIZE);
455+
415456
retry:
416457
/* Grab the spinlock. */
417458
SpinLockAcquire(&pbscan->phs_mutex);
@@ -451,13 +492,40 @@ table_block_parallelscan_startblock_init(Relation rel, ParallelBlockTableScanDes
451492
* backend gets an InvalidBlockNumber return.
452493
*/
453494
BlockNumber
454-
table_block_parallelscan_nextpage(Relationrel,ParallelBlockTableScanDescpbscan)
495+
table_block_parallelscan_nextpage(Relationrel,
496+
ParallelBlockTableScanWorkerpbscanwork,
497+
ParallelBlockTableScanDescpbscan)
455498
{
456499
BlockNumberpage;
457500
uint64nallocated;
458501

459502
/*
460-
* phs_nallocated tracks how many pages have been allocated to workers
503+
* The logic below allocates block numbers out to parallel workers in a
504+
* way that each worker will receive a set of consecutive block numbers to
505+
* scan. Earlier versions of this would allocate the next highest block
506+
* number to the next worker to call this function. This would generally
507+
* result in workers never receiving consecutive block numbers. Some
508+
* operating systems would not detect the sequential I/O pattern due to
509+
* each backend being a different process which could result in poor
510+
* performance due to inefficient or no readahead. To work around this
511+
* issue, we now allocate a range of block numbers for each worker and
512+
* when they come back for another block, we give them the next one in
513+
* that range until the range is complete. When the worker completes the
514+
* range of blocks we then allocate another range for it and return the
515+
* first block number from that range.
516+
*
517+
* Here we name these ranges of blocks "chunks". The initial size of
518+
* these chunks is determined in table_block_parallelscan_startblock_init
519+
* based on the size of the relation. Towards the end of the scan, we
520+
* start making reductions in the size of the chunks in order to attempt
521+
* to divide the remaining work over all the workers as evenly as
522+
* possible.
523+
*
524+
* Here pbscanwork is local worker memory. phsw_chunk_remaining tracks
525+
* the number of blocks remaining in the chunk. When that reaches 0 then
526+
* we must allocate a new chunk for the worker.
527+
*
528+
* phs_nallocated tracks how many blocks have been allocated to workers
461529
* already. When phs_nallocated >= rs_nblocks, all blocks have been
462530
* allocated.
463531
*
@@ -468,10 +536,50 @@ table_block_parallelscan_nextpage(Relation rel, ParallelBlockTableScanDesc pbsca
468536
* wide because of that, to avoid wrapping around when rs_nblocks is close
469537
* to 2^32.
470538
*
471-
* The actualpage to return is calculated by adding the counter to the
539+
* The actualblock to return is calculated by adding the counter to the
472540
* starting block number, modulo nblocks.
473541
*/
474-
nallocated=pg_atomic_fetch_add_u64(&pbscan->phs_nallocated,1);
542+
543+
/*
544+
* First check if we have any remaining blocks in a previous chunk for
545+
* this worker. We must consume all of the blocks from that before we
546+
* allocate a new chunk to the worker.
547+
*/
548+
if (pbscanwork->phsw_chunk_remaining>0)
549+
{
550+
/*
551+
* Give them the next block in the range and update the remaining
552+
* number of blocks.
553+
*/
554+
nallocated=++pbscanwork->phsw_nallocated;
555+
pbscanwork->phsw_chunk_remaining--;
556+
}
557+
else
558+
{
559+
/*
560+
* When we've only got PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS chunks
561+
* remaining in the scan, we half the chunk size. Since we reduce the
562+
* chunk size here, we'll hit this again after doing
563+
* PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS at the new size. After a few
564+
* iterations of this, we'll end up doing the last few blocks with the
565+
* chunk size set to 1.
566+
*/
567+
if (pbscanwork->phsw_chunk_size>1&&
568+
pbscanwork->phsw_nallocated>pbscan->phs_nblocks-
569+
(pbscanwork->phsw_chunk_size*PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS))
570+
pbscanwork->phsw_chunk_size >>=1;
571+
572+
nallocated=pbscanwork->phsw_nallocated=
573+
pg_atomic_fetch_add_u64(&pbscan->phs_nallocated,
574+
pbscanwork->phsw_chunk_size);
575+
576+
/*
577+
* Set the remaining number of blocks in this chunk so that subsequent
578+
* calls from this worker continue on with this chunk until it's done.
579+
*/
580+
pbscanwork->phsw_chunk_remaining=pbscanwork->phsw_chunk_size-1;
581+
}
582+
475583
if (nallocated >=pbscan->phs_nblocks)
476584
page=InvalidBlockNumber;/* all blocks have been allocated */
477585
else

‎src/include/access/relscan.h

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,9 @@ typedef struct TableScanDescData
4242
*/
4343
uint32rs_flags;
4444

45+
void*rs_private;/* per-worker private memory for AM to use */
4546
structParallelTableScanDescData*rs_parallel;/* parallel scan
4647
* information */
47-
4848
}TableScanDescData;
4949
typedefstructTableScanDescData*TableScanDesc;
5050

@@ -81,6 +81,18 @@ typedef struct ParallelBlockTableScanDescData
8181
}ParallelBlockTableScanDescData;
8282
typedefstructParallelBlockTableScanDescData*ParallelBlockTableScanDesc;
8383

84+
/*
85+
* Per backend state for parallel table scan, for block-oriented storage.
86+
*/
87+
typedefstructParallelBlockTableScanWorkerData
88+
{
89+
uint64phsw_nallocated;/* Current # of blocks into the scan */
90+
uint32phsw_chunk_remaining;/* # blocks left in this chunk */
91+
uint32phsw_chunk_size;/* The number of blocks to allocate in
92+
* each I/O chunk for the scan */
93+
}ParallelBlockTableScanWorkerData;
94+
typedefstructParallelBlockTableScanWorkerData*ParallelBlockTableScanWorker;
95+
8496
/*
8597
* Base class for fetches from a table via an index. This is the base-class
8698
* for such scans, which needs to be embedded in the respective struct for

‎src/include/access/tableam.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1793,8 +1793,10 @@ extern Size table_block_parallelscan_initialize(Relation rel,
17931793
externvoidtable_block_parallelscan_reinitialize(Relationrel,
17941794
ParallelTableScanDescpscan);
17951795
externBlockNumbertable_block_parallelscan_nextpage(Relationrel,
1796+
ParallelBlockTableScanWorkerpbscanwork,
17961797
ParallelBlockTableScanDescpbscan);
17971798
externvoidtable_block_parallelscan_startblock_init(Relationrel,
1799+
ParallelBlockTableScanWorkerpbscanwork,
17981800
ParallelBlockTableScanDescpbscan);
17991801

18001802

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp