Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 5bcf389

Browse files
committed
Fix EXPLAIN ANALYZE of hash join when the leader doesn't participate.
If a hash join appears in a parallel query, there may be no hash table available for explain.c to inspect even though a hash table may have been built in other processes. This could happen either because parallel_leader_participation was set to off or because the leader happened to hit the end of the outer relation immediately (even though the complete relation is not empty) and decided not to build the hash table.

Commit bf11e7e introduced a way for workers to exchange instrumentation via the DSM segment for Sort nodes even though they are not parallel-aware. This commit does the same for Hash nodes, so that explain.c has a way to find instrumentation data from an arbitrary participant that actually built the hash table.

Author: Thomas Munro
Reviewed-By: Andres Freund
Discussion: https://postgr.es/m/CAEepm%3D3DUQC2-z252N55eOcZBer6DPdM%3DFzrxH9dZc5vYLsjaA%40mail.gmail.com
1 parent 82c5c53 commit 5bcf389

File tree

8 files changed

+245
-26
lines changed

8 files changed

+245
-26
lines changed

‎src/backend/commands/explain.c

Lines changed: 44 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
#include"commands/createas.h"
2020
#include"commands/defrem.h"
2121
#include"commands/prepare.h"
22-
#include"executor/hashjoin.h"
22+
#include"executor/nodeHash.h"
2323
#include"foreign/fdwapi.h"
2424
#include"nodes/extensible.h"
2525
#include"nodes/nodeFuncs.h"
@@ -2379,42 +2379,70 @@ show_sort_info(SortState *sortstate, ExplainState *es)
23792379
staticvoid
23802380
show_hash_info(HashState*hashstate,ExplainState*es)
23812381
{
2382-
HashJoinTablehashtable;
2382+
HashInstrumentation*hinstrument=NULL;
23832383

2384-
hashtable=hashstate->hashtable;
2384+
/*
2385+
* In a parallel query, the leader process may or may not have run the
2386+
* hash join, and even if it did it may not have built a hash table due to
2387+
* timing (if it started late it might have seen no tuples in the outer
2388+
* relation and skipped building the hash table). Therefore we have to be
2389+
* prepared to get instrumentation data from a worker if there is no hash
2390+
* table.
2391+
*/
2392+
if (hashstate->hashtable)
2393+
{
2394+
hinstrument= (HashInstrumentation*)
2395+
palloc(sizeof(HashInstrumentation));
2396+
ExecHashGetInstrumentation(hinstrument,hashstate->hashtable);
2397+
}
2398+
elseif (hashstate->shared_info)
2399+
{
2400+
SharedHashInfo*shared_info=hashstate->shared_info;
2401+
inti;
2402+
2403+
/* Find the first worker that built a hash table. */
2404+
for (i=0;i<shared_info->num_workers;++i)
2405+
{
2406+
if (shared_info->hinstrument[i].nbatch>0)
2407+
{
2408+
hinstrument=&shared_info->hinstrument[i];
2409+
break;
2410+
}
2411+
}
2412+
}
23852413

2386-
if (hashtable)
2414+
if (hinstrument)
23872415
{
2388-
longspacePeakKb= (hashtable->spacePeak+1023) /1024;
2416+
longspacePeakKb= (hinstrument->space_peak+1023) /1024;
23892417

23902418
if (es->format!=EXPLAIN_FORMAT_TEXT)
23912419
{
2392-
ExplainPropertyLong("Hash Buckets",hashtable->nbuckets,es);
2420+
ExplainPropertyLong("Hash Buckets",hinstrument->nbuckets,es);
23932421
ExplainPropertyLong("Original Hash Buckets",
2394-
hashtable->nbuckets_original,es);
2395-
ExplainPropertyLong("Hash Batches",hashtable->nbatch,es);
2422+
hinstrument->nbuckets_original,es);
2423+
ExplainPropertyLong("Hash Batches",hinstrument->nbatch,es);
23962424
ExplainPropertyLong("Original Hash Batches",
2397-
hashtable->nbatch_original,es);
2425+
hinstrument->nbatch_original,es);
23982426
ExplainPropertyLong("Peak Memory Usage",spacePeakKb,es);
23992427
}
2400-
elseif (hashtable->nbatch_original!=hashtable->nbatch||
2401-
hashtable->nbuckets_original!=hashtable->nbuckets)
2428+
elseif (hinstrument->nbatch_original!=hinstrument->nbatch||
2429+
hinstrument->nbuckets_original!=hinstrument->nbuckets)
24022430
{
24032431
appendStringInfoSpaces(es->str,es->indent*2);
24042432
appendStringInfo(es->str,
24052433
"Buckets: %d (originally %d) Batches: %d (originally %d) Memory Usage: %ldkB\n",
2406-
hashtable->nbuckets,
2407-
hashtable->nbuckets_original,
2408-
hashtable->nbatch,
2409-
hashtable->nbatch_original,
2434+
hinstrument->nbuckets,
2435+
hinstrument->nbuckets_original,
2436+
hinstrument->nbatch,
2437+
hinstrument->nbatch_original,
24102438
spacePeakKb);
24112439
}
24122440
else
24132441
{
24142442
appendStringInfoSpaces(es->str,es->indent*2);
24152443
appendStringInfo(es->str,
24162444
"Buckets: %d Batches: %d Memory Usage: %ldkB\n",
2417-
hashtable->nbuckets,hashtable->nbatch,
2445+
hinstrument->nbuckets,hinstrument->nbatch,
24182446
spacePeakKb);
24192447
}
24202448
}

‎src/backend/executor/execParallel.c

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include"executor/nodeBitmapHeapscan.h"
3030
#include"executor/nodeCustom.h"
3131
#include"executor/nodeForeignscan.h"
32+
#include"executor/nodeHash.h"
3233
#include"executor/nodeIndexscan.h"
3334
#include"executor/nodeIndexonlyscan.h"
3435
#include"executor/nodeSeqscan.h"
@@ -259,8 +260,12 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e)
259260
ExecBitmapHeapEstimate((BitmapHeapScanState*)planstate,
260261
e->pcxt);
261262
break;
263+
caseT_HashState:
264+
/* even when not parallel-aware, for EXPLAIN ANALYZE */
265+
ExecHashEstimate((HashState*)planstate,e->pcxt);
266+
break;
262267
caseT_SortState:
263-
/* even when not parallel-aware */
268+
/* even when not parallel-aware, for EXPLAIN ANALYZE */
264269
ExecSortEstimate((SortState*)planstate,e->pcxt);
265270
break;
266271

@@ -458,8 +463,12 @@ ExecParallelInitializeDSM(PlanState *planstate,
458463
ExecBitmapHeapInitializeDSM((BitmapHeapScanState*)planstate,
459464
d->pcxt);
460465
break;
466+
caseT_HashState:
467+
/* even when not parallel-aware, for EXPLAIN ANALYZE */
468+
ExecHashInitializeDSM((HashState*)planstate,d->pcxt);
469+
break;
461470
caseT_SortState:
462-
/* even when not parallel-aware */
471+
/* even when not parallel-aware, for EXPLAIN ANALYZE */
463472
ExecSortInitializeDSM((SortState*)planstate,d->pcxt);
464473
break;
465474

@@ -872,8 +881,12 @@ ExecParallelReInitializeDSM(PlanState *planstate,
872881
ExecBitmapHeapReInitializeDSM((BitmapHeapScanState*)planstate,
873882
pcxt);
874883
break;
884+
caseT_HashState:
885+
/* even when not parallel-aware, for EXPLAIN ANALYZE */
886+
ExecHashReInitializeDSM((HashState*)planstate,pcxt);
887+
break;
875888
caseT_SortState:
876-
/* even when not parallel-aware */
889+
/* even when not parallel-aware, for EXPLAIN ANALYZE */
877890
ExecSortReInitializeDSM((SortState*)planstate,pcxt);
878891
break;
879892

@@ -928,12 +941,18 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate,
928941
planstate->worker_instrument->num_workers=instrumentation->num_workers;
929942
memcpy(&planstate->worker_instrument->instrument,instrument,ibytes);
930943

931-
/*
932-
* Perform any node-type-specific work that needs to be done. Currently,
933-
* only Sort nodes need to do anything here.
934-
*/
935-
if (IsA(planstate,SortState))
936-
ExecSortRetrieveInstrumentation((SortState*)planstate);
944+
/* Perform any node-type-specific work that needs to be done. */
945+
switch (nodeTag(planstate))
946+
{
947+
caseT_SortState:
948+
ExecSortRetrieveInstrumentation((SortState*)planstate);
949+
break;
950+
caseT_HashState:
951+
ExecHashRetrieveInstrumentation((HashState*)planstate);
952+
break;
953+
default:
954+
break;
955+
}
937956

938957
returnplanstate_tree_walker(planstate,ExecParallelRetrieveInstrumentation,
939958
instrumentation);
@@ -1160,8 +1179,12 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt)
11601179
ExecBitmapHeapInitializeWorker((BitmapHeapScanState*)planstate,
11611180
pwcxt);
11621181
break;
1182+
caseT_HashState:
1183+
/* even when not parallel-aware, for EXPLAIN ANALYZE */
1184+
ExecHashInitializeWorker((HashState*)planstate,pwcxt);
1185+
break;
11631186
caseT_SortState:
1164-
/* even when not parallel-aware */
1187+
/* even when not parallel-aware, for EXPLAIN ANALYZE */
11651188
ExecSortInitializeWorker((SortState*)planstate,pwcxt);
11661189
break;
11671190

‎src/backend/executor/execProcnode.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -751,6 +751,9 @@ ExecShutdownNode(PlanState *node)
751751
caseT_GatherMergeState:
752752
ExecShutdownGatherMerge((GatherMergeState*)node);
753753
break;
754+
caseT_HashState:
755+
ExecShutdownHash((HashState*)node);
756+
break;
754757
default:
755758
break;
756759
}

‎src/backend/executor/nodeHash.c

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1637,6 +1637,110 @@ ExecHashRemoveNextSkewBucket(HashJoinTable hashtable)
16371637
}
16381638
}
16391639

1640+
/*
1641+
* Reserve space in the DSM segment for instrumentation data.
1642+
*/
1643+
void
1644+
ExecHashEstimate(HashState*node,ParallelContext*pcxt)
1645+
{
1646+
size_tsize;
1647+
1648+
size=mul_size(pcxt->nworkers,sizeof(HashInstrumentation));
1649+
size=add_size(size, offsetof(SharedHashInfo,hinstrument));
1650+
shm_toc_estimate_chunk(&pcxt->estimator,size);
1651+
shm_toc_estimate_keys(&pcxt->estimator,1);
1652+
}
1653+
1654+
/*
1655+
* Set up a space in the DSM for all workers to record instrumentation data
1656+
* about their hash table.
1657+
*/
1658+
void
1659+
ExecHashInitializeDSM(HashState*node,ParallelContext*pcxt)
1660+
{
1661+
size_tsize;
1662+
1663+
size= offsetof(SharedHashInfo,hinstrument)+
1664+
pcxt->nworkers*sizeof(HashInstrumentation);
1665+
node->shared_info= (SharedHashInfo*)shm_toc_allocate(pcxt->toc,size);
1666+
memset(node->shared_info,0,size);
1667+
node->shared_info->num_workers=pcxt->nworkers;
1668+
shm_toc_insert(pcxt->toc,node->ps.plan->plan_node_id,
1669+
node->shared_info);
1670+
}
1671+
1672+
/*
1673+
* Reset shared state before beginning a fresh scan.
1674+
*/
1675+
void
1676+
ExecHashReInitializeDSM(HashState*node,ParallelContext*pcxt)
1677+
{
1678+
if (node->shared_info!=NULL)
1679+
{
1680+
memset(node->shared_info->hinstrument,0,
1681+
node->shared_info->num_workers*sizeof(HashInstrumentation));
1682+
}
1683+
}
1684+
1685+
/*
1686+
* Locate the DSM space for hash table instrumentation data that we'll write
1687+
* to at shutdown time.
1688+
*/
1689+
void
1690+
ExecHashInitializeWorker(HashState*node,ParallelWorkerContext*pwcxt)
1691+
{
1692+
SharedHashInfo*shared_info;
1693+
1694+
shared_info= (SharedHashInfo*)
1695+
shm_toc_lookup(pwcxt->toc,node->ps.plan->plan_node_id, true);
1696+
node->hinstrument=&shared_info->hinstrument[ParallelWorkerNumber];
1697+
}
1698+
1699+
/*
1700+
* Copy instrumentation data from this worker's hash table (if it built one)
1701+
* to DSM memory so the leader can retrieve it. This must be done in an
1702+
* ExecShutdownHash() rather than ExecEndHash() because the latter runs after
1703+
* we've detached from the DSM segment.
1704+
*/
1705+
void
1706+
ExecShutdownHash(HashState*node)
1707+
{
1708+
if (node->hinstrument&&node->hashtable)
1709+
ExecHashGetInstrumentation(node->hinstrument,node->hashtable);
1710+
}
1711+
1712+
/*
1713+
* Retrieve instrumentation data from workers before the DSM segment is
1714+
* detached, so that EXPLAIN can access it.
1715+
*/
1716+
void
1717+
ExecHashRetrieveInstrumentation(HashState*node)
1718+
{
1719+
SharedHashInfo*shared_info=node->shared_info;
1720+
size_tsize;
1721+
1722+
/* Replace node->shared_info with a copy in backend-local memory. */
1723+
size= offsetof(SharedHashInfo,hinstrument)+
1724+
shared_info->num_workers*sizeof(HashInstrumentation);
1725+
node->shared_info=palloc(size);
1726+
memcpy(node->shared_info,shared_info,size);
1727+
}
1728+
1729+
/*
1730+
* Copy the instrumentation data from 'hashtable' into a HashInstrumentation
1731+
* struct.
1732+
*/
1733+
void
1734+
ExecHashGetInstrumentation(HashInstrumentation*instrument,
1735+
HashJoinTablehashtable)
1736+
{
1737+
instrument->nbuckets=hashtable->nbuckets;
1738+
instrument->nbuckets_original=hashtable->nbuckets_original;
1739+
instrument->nbatch=hashtable->nbatch;
1740+
instrument->nbatch_original=hashtable->nbatch_original;
1741+
instrument->space_peak=hashtable->spacePeak;
1742+
}
1743+
16401744
/*
16411745
* Allocate 'size' bytes from the currently active HashMemoryChunk
16421746
*/

‎src/include/executor/nodeHash.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#ifndefNODEHASH_H
1515
#defineNODEHASH_H
1616

17+
#include"access/parallel.h"
1718
#include"nodes/execnodes.h"
1819

1920
externHashState*ExecInitHash(Hash*node,EState*estate,inteflags);
@@ -48,5 +49,13 @@ extern void ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
4849
int*numbatches,
4950
int*num_skew_mcvs);
5051
externintExecHashGetSkewBucket(HashJoinTablehashtable,uint32hashvalue);
52+
externvoidExecHashEstimate(HashState*node,ParallelContext*pcxt);
53+
externvoidExecHashInitializeDSM(HashState*node,ParallelContext*pcxt);
54+
externvoidExecHashInitializeWorker(HashState*node,ParallelWorkerContext*pwcxt);
55+
externvoidExecHashReInitializeDSM(HashState*node,ParallelContext*pcxt);
56+
externvoidExecHashRetrieveInstrumentation(HashState*node);
57+
externvoidExecShutdownHash(HashState*node);
58+
externvoidExecHashGetInstrumentation(HashInstrumentation*instrument,
59+
HashJoinTablehashtable);
5160

5261
#endif/* NODEHASH_H */

‎src/include/nodes/execnodes.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1980,6 +1980,29 @@ typedef struct GatherMergeState
19801980
structbinaryheap*gm_heap;/* binary heap of slot indices */
19811981
}GatherMergeState;
19821982

1983+
/* ----------------
1984+
* Values displayed by EXPLAIN ANALYZE
1985+
* ----------------
1986+
*/
1987+
typedefstructHashInstrumentation
1988+
{
1989+
intnbuckets;/* number of buckets at end of execution */
1990+
intnbuckets_original;/* planned number of buckets */
1991+
intnbatch;/* number of batches at end of execution */
1992+
intnbatch_original;/* planned number of batches */
1993+
size_tspace_peak;/* peak memory usage in bytes */
1994+
}HashInstrumentation;
1995+
1996+
/* ----------------
1997+
* Shared memory container for per-worker hash information
1998+
* ----------------
1999+
*/
2000+
typedefstructSharedHashInfo
2001+
{
2002+
intnum_workers;
2003+
HashInstrumentationhinstrument[FLEXIBLE_ARRAY_MEMBER];
2004+
}SharedHashInfo;
2005+
19832006
/* ----------------
19842007
* HashState information
19852008
* ----------------
@@ -1990,6 +2013,9 @@ typedef struct HashState
19902013
HashJoinTablehashtable;/* hash table for the hashjoin */
19912014
List*hashkeys;/* list of ExprState nodes */
19922015
/* hashkeys is same as parent's hj_InnerHashKeys */
2016+
2017+
SharedHashInfo*shared_info;/* one entry per worker */
2018+
HashInstrumentation*hinstrument;/* this worker's entry */
19932019
}HashState;
19942020

19952021
/* ----------------

‎src/test/regress/expected/join.out

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6173,6 +6173,21 @@ $$);
61736173

61746174
rollback to settings;
61756175
-- A couple of other hash join tests unrelated to work_mem management.
6176+
-- Check that EXPLAIN ANALYZE has data even if the leader doesn't participate
6177+
savepoint settings;
6178+
set local max_parallel_workers_per_gather = 2;
6179+
set local work_mem = '4MB';
6180+
set local parallel_leader_participation = off;
6181+
select * from hash_join_batches(
6182+
$$
6183+
select count(*) from simple r join simple s using (id);
6184+
$$);
6185+
original | final
6186+
----------+-------
6187+
1 | 1
6188+
(1 row)
6189+
6190+
rollback to settings;
61766191
-- A full outer join where every record is matched.
61776192
-- non-parallel
61786193
savepoint settings;

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp