Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 56788d2

Browse files
committed
Allocate consecutive blocks during parallel seqscans
Previously we would allocate blocks to parallel workers during a parallel sequential scan 1 block at a time. Since other workers were likely to request a block before a worker returns for another block number to work on, this could lead to non-sequential I/O patterns in each worker which could cause the operating system's readahead to perform poorly or not at all.

Here we change things so that we allocate consecutive "chunks" of blocks to workers and have them work on those until they're done, at which time we allocate another chunk for the worker. The size of these chunks is based on the size of the relation.

Initial patch here was by Thomas Munro which showed some good improvements just having a fixed chunk size of 64 blocks with a simple ramp-down near the end of the scan. The revisions of the patch to make the chunk size based on the relation size and the adjusted ramp-down in powers of two was done by me, along with quite extensive benchmarking to determine the optimal chunk sizes.

For the most part, benchmarks have shown significant performance improvements for large parallel sequential scans on Linux, FreeBSD and Windows using SSDs. It's less clear how this affects the performance of cloud providers. Tests done so far are unable to obtain stable enough performance to provide meaningful benchmark results. It is possible that this could cause some performance regressions on more obscure filesystems, so we may need to later provide users with some ability to get something closer to the old behavior. For now, let's leave that until we see that it's really required.

Author: Thomas Munro, David Rowley
Reviewed-by: Ranier Vilela, Soumyadeep Chakraborty, Robert Haas
Reviewed-by: Amit Kapila, Kirk Jamison
Discussion: https://postgr.es/m/CA+hUKGJ_EErDv41YycXcbMbCBkztA34+z1ts9VQH+ACRuvpxig@mail.gmail.com
1 parent 11a68e4 · commit 56788d2

File tree

4 files changed

+144
-12
lines changed

4 files changed

+144
-12
lines changed

‎src/backend/access/heap/heapam.c

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -520,12 +520,14 @@ heapgettup(HeapScanDesc scan,
520520
{
521521
ParallelBlockTableScanDescpbscan=
522522
(ParallelBlockTableScanDesc)scan->rs_base.rs_parallel;
523+
ParallelBlockTableScanWorkerpbscanwork=
524+
(ParallelBlockTableScanWorker)scan->rs_base.rs_private;
523525

524526
table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
525-
pbscan);
527+
pbscanwork,pbscan);
526528

527529
page=table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
528-
pbscan);
530+
pbscanwork,pbscan);
529531

530532
/* Other processes might have already finished the scan. */
531533
if (page==InvalidBlockNumber)
@@ -720,9 +722,11 @@ heapgettup(HeapScanDesc scan,
720722
{
721723
ParallelBlockTableScanDescpbscan=
722724
(ParallelBlockTableScanDesc)scan->rs_base.rs_parallel;
725+
ParallelBlockTableScanWorkerpbscanwork=
726+
(ParallelBlockTableScanWorker)scan->rs_base.rs_private;
723727

724728
page=table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
725-
pbscan);
729+
pbscanwork,pbscan);
726730
finished= (page==InvalidBlockNumber);
727731
}
728732
else
@@ -834,12 +838,14 @@ heapgettup_pagemode(HeapScanDesc scan,
834838
{
835839
ParallelBlockTableScanDescpbscan=
836840
(ParallelBlockTableScanDesc)scan->rs_base.rs_parallel;
841+
ParallelBlockTableScanWorkerpbscanwork=
842+
(ParallelBlockTableScanWorker)scan->rs_base.rs_private;
837843

838844
table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
839-
pbscan);
845+
pbscanwork,pbscan);
840846

841847
page=table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
842-
pbscan);
848+
pbscanwork,pbscan);
843849

844850
/* Other processes might have already finished the scan. */
845851
if (page==InvalidBlockNumber)
@@ -1019,9 +1025,11 @@ heapgettup_pagemode(HeapScanDesc scan,
10191025
{
10201026
ParallelBlockTableScanDescpbscan=
10211027
(ParallelBlockTableScanDesc)scan->rs_base.rs_parallel;
1028+
ParallelBlockTableScanWorkerpbscanwork=
1029+
(ParallelBlockTableScanWorker)scan->rs_base.rs_private;
10221030

10231031
page=table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
1024-
pbscan);
1032+
pbscanwork,pbscan);
10251033
finished= (page==InvalidBlockNumber);
10261034
}
10271035
else
@@ -1155,6 +1163,8 @@ heap_beginscan(Relation relation, Snapshot snapshot,
11551163
scan->rs_base.rs_nkeys=nkeys;
11561164
scan->rs_base.rs_flags=flags;
11571165
scan->rs_base.rs_parallel=parallel_scan;
1166+
scan->rs_base.rs_private=
1167+
palloc(sizeof(ParallelBlockTableScanWorkerData));
11581168
scan->rs_strategy=NULL;/* set in initscan */
11591169

11601170
/*

‎src/backend/access/table/tableam.c

Lines changed: 113 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,24 @@
2525
#include"access/tableam.h"
2626
#include"access/xact.h"
2727
#include"optimizer/plancat.h"
28+
#include"port/pg_bitutils.h"
2829
#include"storage/bufmgr.h"
2930
#include"storage/shmem.h"
3031
#include"storage/smgr.h"
3132

33+
/*
34+
* Constants to control the behavior of block allocation to parallel workers
35+
* during a parallel seqscan. Technically these values do not need to be
36+
* powers of 2, but having them as powers of 2 makes the math more optimal
37+
* and makes the ramp-down stepping more even.
38+
*/
39+
40+
/* The number of I/O chunks we try to break a parallel seqscan down into */
41+
#definePARALLEL_SEQSCAN_NCHUNKS2048
42+
/* Ramp down size of allocations when we've only this number of chunks left */
43+
#definePARALLEL_SEQSCAN_RAMPDOWN_CHUNKS64
44+
/* Cap the size of parallel I/O chunks to this number of blocks */
45+
#definePARALLEL_SEQSCAN_MAX_CHUNK_SIZE8192
3246

3347
/* GUC variables */
3448
char*default_table_access_method=DEFAULT_TABLE_ACCESS_METHOD;
@@ -408,10 +422,37 @@ table_block_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
408422
* to set the startblock once.
409423
*/
410424
void
411-
table_block_parallelscan_startblock_init(Relationrel,ParallelBlockTableScanDescpbscan)
425+
table_block_parallelscan_startblock_init(Relationrel,
426+
ParallelBlockTableScanWorkerpbscanwork,
427+
ParallelBlockTableScanDescpbscan)
412428
{
413429
BlockNumbersync_startpage=InvalidBlockNumber;
414430

431+
/* Reset the state we use for controlling allocation size. */
432+
memset(pbscanwork,0,sizeof(*pbscanwork));
433+
434+
StaticAssertStmt(MaxBlockNumber <=0xFFFFFFFE,
435+
"pg_nextpower2_32 may be too small for non-standard BlockNumber width");
436+
437+
/*
438+
* We determine the chunk size based on the size of the relation. First we
439+
* split the relation into PARALLEL_SEQSCAN_NCHUNKS chunks but we then
440+
* take the next highest power of 2 number of the chunk size. This means
441+
* we split the relation into somewhere between PARALLEL_SEQSCAN_NCHUNKS
442+
* and PARALLEL_SEQSCAN_NCHUNKS / 2 chunks.
443+
*/
444+
pbscanwork->phsw_chunk_size=pg_nextpower2_32(Max(pbscan->phs_nblocks /
445+
PARALLEL_SEQSCAN_NCHUNKS,1));
446+
447+
/*
448+
* Ensure we don't go over the maximum chunk size with larger tables. This
449+
* means we may get much more than PARALLEL_SEQSCAN_NCHUNKS for larger
450+
* tables. Too large a chunk size has been shown to be detrimental to
451+
* synchronous scan performance.
452+
*/
453+
pbscanwork->phsw_chunk_size=Min(pbscanwork->phsw_chunk_size,
454+
PARALLEL_SEQSCAN_MAX_CHUNK_SIZE);
455+
415456
retry:
416457
/* Grab the spinlock. */
417458
SpinLockAcquire(&pbscan->phs_mutex);
@@ -451,13 +492,40 @@ table_block_parallelscan_startblock_init(Relation rel, ParallelBlockTableScanDes
451492
* backend gets an InvalidBlockNumber return.
452493
*/
453494
BlockNumber
454-
table_block_parallelscan_nextpage(Relationrel,ParallelBlockTableScanDescpbscan)
495+
table_block_parallelscan_nextpage(Relationrel,
496+
ParallelBlockTableScanWorkerpbscanwork,
497+
ParallelBlockTableScanDescpbscan)
455498
{
456499
BlockNumberpage;
457500
uint64nallocated;
458501

459502
/*
460-
* phs_nallocated tracks how many pages have been allocated to workers
503+
* The logic below allocates block numbers out to parallel workers in a
504+
* way that each worker will receive a set of consecutive block numbers to
505+
* scan. Earlier versions of this would allocate the next highest block
506+
* number to the next worker to call this function. This would generally
507+
* result in workers never receiving consecutive block numbers. Some
508+
* operating systems would not detect the sequential I/O pattern due to
509+
* each backend being a different process which could result in poor
510+
* performance due to inefficient or no readahead. To work around this
511+
* issue, we now allocate a range of block numbers for each worker and
512+
* when they come back for another block, we give them the next one in
513+
* that range until the range is complete. When the worker completes the
514+
* range of blocks we then allocate another range for it and return the
515+
* first block number from that range.
516+
*
517+
* Here we name these ranges of blocks "chunks". The initial size of
518+
* these chunks is determined in table_block_parallelscan_startblock_init
519+
* based on the size of the relation. Towards the end of the scan, we
520+
* start making reductions in the size of the chunks in order to attempt
521+
* to divide the remaining work over all the workers as evenly as
522+
* possible.
523+
*
524+
* Here pbscanwork is local worker memory. phsw_chunk_remaining tracks
525+
* the number of blocks remaining in the chunk. When that reaches 0 then
526+
* we must allocate a new chunk for the worker.
527+
*
528+
* phs_nallocated tracks how many blocks have been allocated to workers
461529
* already. When phs_nallocated >= rs_nblocks, all blocks have been
462530
* allocated.
463531
*
@@ -468,10 +536,50 @@ table_block_parallelscan_nextpage(Relation rel, ParallelBlockTableScanDesc pbsca
468536
* wide because of that, to avoid wrapping around when rs_nblocks is close
469537
* to 2^32.
470538
*
471-
* The actualpage to return is calculated by adding the counter to the
539+
* The actualblock to return is calculated by adding the counter to the
472540
* starting block number, modulo nblocks.
473541
*/
474-
nallocated=pg_atomic_fetch_add_u64(&pbscan->phs_nallocated,1);
542+
543+
/*
544+
* First check if we have any remaining blocks in a previous chunk for
545+
* this worker. We must consume all of the blocks from that before we
546+
* allocate a new chunk to the worker.
547+
*/
548+
if (pbscanwork->phsw_chunk_remaining>0)
549+
{
550+
/*
551+
* Give them the next block in the range and update the remaining
552+
* number of blocks.
553+
*/
554+
nallocated=++pbscanwork->phsw_nallocated;
555+
pbscanwork->phsw_chunk_remaining--;
556+
}
557+
else
558+
{
559+
/*
560+
* When we've only got PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS chunks
561+
* remaining in the scan, we half the chunk size. Since we reduce the
562+
* chunk size here, we'll hit this again after doing
563+
* PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS at the new size. After a few
564+
* iterations of this, we'll end up doing the last few blocks with the
565+
* chunk size set to 1.
566+
*/
567+
if (pbscanwork->phsw_chunk_size>1&&
568+
pbscanwork->phsw_nallocated>pbscan->phs_nblocks-
569+
(pbscanwork->phsw_chunk_size*PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS))
570+
pbscanwork->phsw_chunk_size >>=1;
571+
572+
nallocated=pbscanwork->phsw_nallocated=
573+
pg_atomic_fetch_add_u64(&pbscan->phs_nallocated,
574+
pbscanwork->phsw_chunk_size);
575+
576+
/*
577+
* Set the remaining number of blocks in this chunk so that subsequent
578+
* calls from this worker continue on with this chunk until it's done.
579+
*/
580+
pbscanwork->phsw_chunk_remaining=pbscanwork->phsw_chunk_size-1;
581+
}
582+
475583
if (nallocated >=pbscan->phs_nblocks)
476584
page=InvalidBlockNumber;/* all blocks have been allocated */
477585
else

‎src/include/access/relscan.h

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,9 @@ typedef struct TableScanDescData
4242
*/
4343
uint32rs_flags;
4444

45+
void*rs_private;/* per-worker private memory for AM to use */
4546
structParallelTableScanDescData*rs_parallel;/* parallel scan
4647
* information */
47-
4848
}TableScanDescData;
4949
typedefstructTableScanDescData*TableScanDesc;
5050

@@ -81,6 +81,18 @@ typedef struct ParallelBlockTableScanDescData
8181
}ParallelBlockTableScanDescData;
8282
typedefstructParallelBlockTableScanDescData*ParallelBlockTableScanDesc;
8383

84+
/*
85+
* Per backend state for parallel table scan, for block-oriented storage.
86+
*/
87+
typedefstructParallelBlockTableScanWorkerData
88+
{
89+
uint64phsw_nallocated;/* Current # of blocks into the scan */
90+
uint32phsw_chunk_remaining;/* # blocks left in this chunk */
91+
uint32phsw_chunk_size;/* The number of blocks to allocate in
92+
* each I/O chunk for the scan */
93+
}ParallelBlockTableScanWorkerData;
94+
typedefstructParallelBlockTableScanWorkerData*ParallelBlockTableScanWorker;
95+
8496
/*
8597
* Base class for fetches from a table via an index. This is the base-class
8698
* for such scans, which needs to be embedded in the respective struct for

‎src/include/access/tableam.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1793,8 +1793,10 @@ extern Size table_block_parallelscan_initialize(Relation rel,
17931793
externvoidtable_block_parallelscan_reinitialize(Relationrel,
17941794
ParallelTableScanDescpscan);
17951795
externBlockNumbertable_block_parallelscan_nextpage(Relationrel,
1796+
ParallelBlockTableScanWorkerpbscanwork,
17961797
ParallelBlockTableScanDescpbscan);
17971798
externvoidtable_block_parallelscan_startblock_init(Relationrel,
1799+
ParallelBlockTableScanWorkerpbscanwork,
17981800
ParallelBlockTableScanDescpbscan);
17991801

18001802

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp