Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit849074f

Browse files
committed
Revise hash join code so that we can increase the number of batches
on-the-fly, and thereby avoid blowing out memory when the planner hasunderestimated the hash table size. Hash join will now obey thework_mem limit with some faithfulness. Per my recent proposal(hash aggregate part isn't done yet though).
1 parent31b6d84 commit849074f

File tree

9 files changed

+632
-428
lines changed

9 files changed

+632
-428
lines changed

‎src/backend/executor/nodeHash.c

Lines changed: 313 additions & 198 deletions
Large diffs are not rendered by default.

‎src/backend/executor/nodeHashjoin.c

Lines changed: 228 additions & 165 deletions
Large diffs are not rendered by default.

‎src/backend/optimizer/path/costsize.c

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@
4949
* Portions Copyright (c) 1994, Regents of the University of California
5050
*
5151
* IDENTIFICATION
52-
* $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.137 2004/12/31 22:00:04pgsql Exp $
52+
* $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.138 2005/03/06 22:15:04tgl Exp $
5353
*
5454
*-------------------------------------------------------------------------
5555
*/
@@ -1074,9 +1074,9 @@ cost_hashjoin(HashPath *path, Query *root)
10741074
doubleinnerbytes=relation_byte_size(inner_path_rows,
10751075
inner_path->parent->width);
10761076
intnum_hashclauses=list_length(hashclauses);
1077-
intvirtualbuckets;
1078-
intphysicalbuckets;
1077+
intnumbuckets;
10791078
intnumbatches;
1079+
doublevirtualbuckets;
10801080
Selectivityinnerbucketsize;
10811081
Selectivityjoininfactor;
10821082
ListCell*hcl;
@@ -1123,9 +1123,9 @@ cost_hashjoin(HashPath *path, Query *root)
11231123
/* Get hash table size that executor would use for inner relation */
11241124
ExecChooseHashTableSize(inner_path_rows,
11251125
inner_path->parent->width,
1126-
&virtualbuckets,
1127-
&physicalbuckets,
1126+
&numbuckets,
11281127
&numbatches);
1128+
virtualbuckets= (double)numbuckets* (double)numbatches;
11291129

11301130
/*
11311131
* Determine bucketsize fraction for inner relation. We use the
@@ -1196,13 +1196,13 @@ cost_hashjoin(HashPath *path, Query *root)
11961196
}
11971197

11981198
/*
1199-
*if inner relation is too big then we will need to "batch" the join,
1199+
*If inner relation is too big then we will need to "batch" the join,
12001200
* which implies writing and reading most of the tuples to disk an
12011201
* extra time.Charge one cost unit per page of I/O (correct since it
12021202
* should be nice and sequential...). Writing the inner rel counts as
12031203
* startup cost, all the rest as run cost.
12041204
*/
1205-
if (numbatches)
1205+
if (numbatches>1)
12061206
{
12071207
doubleouterpages=page_size(outer_path_rows,
12081208
outer_path->parent->width);
@@ -1228,7 +1228,9 @@ cost_hashjoin(HashPath *path, Query *root)
12281228
* The number of tuple comparisons needed is the number of outer
12291229
* tuples times the typical number of tuples in a hash bucket, which
12301230
* is the inner relation size times its bucketsize fraction. At each
1231-
* one, we need to evaluate the hashjoin quals.
1231+
* one, we need to evaluate the hashjoin quals. (Note: charging the
1232+
* full qual eval cost at each tuple is pessimistic, since we don't
1233+
* evaluate the quals unless the hash values match exactly.)
12321234
*/
12331235
startup_cost+=hash_qual_cost.startup;
12341236
run_cost+=hash_qual_cost.per_tuple*

‎src/backend/utils/adt/selfuncs.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
*
1616
*
1717
* IDENTIFICATION
18-
* $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.171 2005/02/01 23:07:58 tgl Exp $
18+
* $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.172 2005/03/06 22:15:04 tgl Exp $
1919
*
2020
*-------------------------------------------------------------------------
2121
*/
@@ -2154,7 +2154,7 @@ estimate_num_groups(Query *root, List *groupExprs, double input_rows)
21542154
* inner rel is well-dispersed (or the alternatives seem much worse).
21552155
*/
21562156
Selectivity
2157-
estimate_hash_bucketsize(Query*root,Node*hashkey,intnbuckets)
2157+
estimate_hash_bucketsize(Query*root,Node*hashkey,doublenbuckets)
21582158
{
21592159
VariableStatDatavardata;
21602160
doubleestfract,
@@ -2212,8 +2212,8 @@ estimate_hash_bucketsize(Query *root, Node *hashkey, int nbuckets)
22122212
* the number of buckets is less than the expected number of distinct
22132213
* values; otherwise it is 1/ndistinct.
22142214
*/
2215-
if (ndistinct>(double)nbuckets)
2216-
estfract=1.0 /(double)nbuckets;
2215+
if (ndistinct>nbuckets)
2216+
estfract=1.0 /nbuckets;
22172217
else
22182218
estfract=1.0 /ndistinct;
22192219

‎src/include/executor/hashjoin.h

Lines changed: 45 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $PostgreSQL: pgsql/src/include/executor/hashjoin.h,v 1.34 2004/12/31 22:03:29 pgsql Exp $
10+
* $PostgreSQL: pgsql/src/include/executor/hashjoin.h,v 1.35 2005/03/06 22:15:05 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -20,11 +20,12 @@
2020
/* ----------------------------------------------------------------
2121
*hash-join hash table structures
2222
*
23-
* Each active hashjoin has a HashJoinTable control block which is
23+
* Each active hashjoin has a HashJoinTable control block, which is
2424
* palloc'd in the executor's per-query context. All other storage needed
2525
* for the hashjoin is kept in private memory contexts, two for each hashjoin.
2626
* This makes it easy and fast to release the storage when we don't need it
27-
* anymore.
27+
* anymore. (Exception: data associated with the temp files lives in the
28+
* per-query context too, since we always call buffile.c in that context.)
2829
*
2930
* The hashtable contexts are made children of the per-query context, ensuring
3031
* that they will be discarded at end of statement even if the join is
@@ -35,40 +36,64 @@
3536
* "hashCxt", while storage that is only wanted for the current batch is
3637
* allocated in the "batchCxt". By resetting the batchCxt at the end of
3738
* each batch, we free all the per-batch storage reliably and without tedium.
39+
*
40+
* During first scan of inner relation, we get its tuples from executor.
41+
* If nbatch > 1 then tuples that don't belong in first batch get saved
42+
* into inner-batch temp files. The same statements apply for the
43+
* first scan of the outer relation, except we write tuples to outer-batch
44+
* temp files. After finishing the first scan, we do the following for
45+
* each remaining batch:
46+
*1. Read tuples from inner batch file, load into hash buckets.
47+
*2. Read tuples from outer batch file, match to hash buckets and output.
48+
*
49+
* It is possible to increase nbatch on the fly if the in-memory hash table
50+
* gets too big. The hash-value-to-batch computation is arranged so that this
51+
* can only cause a tuple to go into a later batch than previously thought,
52+
* never into an earlier batch. When we increase nbatch, we rescan the hash
53+
* table and dump out any tuples that are now of a later batch to the correct
54+
* inner batch file. Subsequently, while reading either inner or outer batch
55+
* files, we might find tuples that no longer belong to the current batch;
56+
* if so, we just dump them out to the correct batch file.
3857
* ----------------------------------------------------------------
3958
*/
4059

60+
/* these are in nodes/execnodes.h: */
61+
/* typedef struct HashJoinTupleData *HashJoinTuple; */
62+
/* typedef struct HashJoinTableData *HashJoinTable; */
63+
4164
typedefstructHashJoinTupleData
4265
{
43-
structHashJoinTupleData*next;/* link to next tuple in same
44-
* bucket */
66+
structHashJoinTupleData*next;/* link to next tuple in same bucket */
67+
uint32hashvalue;/* tuple's hash code */
4568
HeapTupleDatahtup;/* tuple header */
4669
}HashJoinTupleData;
4770

48-
typedefHashJoinTupleData*HashJoinTuple;
49-
5071
typedefstructHashJoinTableData
5172
{
52-
intnbuckets;/* buckets inuse during this batch */
53-
inttotalbuckets;/*total numberof(virtual) buckets */
54-
HashJoinTuple*buckets;/*buckets[i] is head of list of tuples */
73+
intnbuckets;/*#buckets inthe in-memory hash table */
74+
/*buckets[i] is headoflist of tuples in i'th in-memory bucket */
75+
structHashJoinTupleData**buckets;
5576
/* buckets array is per-batch storage, as are all the tuples */
5677

57-
intnbatch;/* number of batches; 0 means 1-pass join */
58-
intcurbatch;/* current batch #, or 0 during 1st pass */
78+
intnbatch;/* number of batches */
79+
intcurbatch;/* current batch #; 0 during 1st pass */
80+
81+
intnbatch_original;/* nbatch when we started inner scan */
82+
intnbatch_outstart;/* nbatch when we started outer scan */
83+
84+
boolgrowEnabled;/* flag to shut off nbatch increases */
5985

6086
boolhashNonEmpty;/* did inner plan produce any rows? */
6187

6288
/*
63-
* all these arrays are allocated for the life of the hash join, but
64-
* only if nbatch > 0:
89+
* These arrays are allocated for the life of the hash join, but
90+
* only if nbatch > 1. A file is opened only when we first write
91+
* a tuple into it (otherwise its pointer remains NULL). Note that
92+
* the zero'th array elements never get used, since we will process
93+
* rather than dump out any tuples of batch zero.
6594
*/
6695
BufFile**innerBatchFile;/* buffered virtual temp file per batch */
6796
BufFile**outerBatchFile;/* buffered virtual temp file per batch */
68-
long*outerBatchSize;/* count of tuples in each outer batch
69-
* file */
70-
long*innerBatchSize;/* count of tuples in each inner batch
71-
* file */
7297

7398
/*
7499
* Info about the datatype-specific hash functions for the datatypes
@@ -79,21 +104,11 @@ typedef struct HashJoinTableData
79104
*/
80105
FmgrInfo*hashfunctions;/* lookup data for hash functions */
81106

82-
/*
83-
* During 1st scan of inner relation, we get tuples from executor. If
84-
* nbatch > 0 then tuples that don't belong in first nbuckets logical
85-
* buckets get dumped into inner-batch temp files. The same statements
86-
* apply for the 1st scan of the outer relation, except we write
87-
* tuples to outer-batch temp files. If nbatch > 0 then we do the
88-
* following for each batch: 1. Read tuples from inner batch file,
89-
* load into hash buckets. 2. Read tuples from outer batch file, match
90-
* to hash buckets and output.
91-
*/
107+
SizespaceUsed;/* memory space currently used by tuples */
108+
SizespaceAllowed;/* upper limit for space used */
92109

93110
MemoryContexthashCxt;/* context for whole-hash-join storage */
94111
MemoryContextbatchCxt;/* context for this-batch-only storage */
95112
}HashJoinTableData;
96113

97-
typedefHashJoinTableData*HashJoinTable;
98-
99114
#endif/* HASHJOIN_H */

‎src/include/executor/nodeHash.h

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $PostgreSQL: pgsql/src/include/executor/nodeHash.h,v 1.35 2004/12/31 22:03:29 pgsql Exp $
10+
* $PostgreSQL: pgsql/src/include/executor/nodeHash.h,v 1.36 2005/03/06 22:15:05 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -25,18 +25,20 @@ extern void ExecReScanHash(HashState *node, ExprContext *exprCtxt);
2525
externHashJoinTableExecHashTableCreate(Hash*node,List*hashOperators);
2626
externvoidExecHashTableDestroy(HashJoinTablehashtable);
2727
externvoidExecHashTableInsert(HashJoinTablehashtable,
28-
ExprContext*econtext,
29-
List*hashkeys);
30-
externintExecHashGetBucket(HashJoinTablehashtable,
31-
ExprContext*econtext,
32-
List*hashkeys);
33-
externintExecHashGetBatch(intbucketno,HashJoinTablehashtable);
34-
externHeapTupleExecScanHashBucket(HashJoinState*hjstate,List*hjclauses,
35-
ExprContext*econtext);
36-
externvoidExecHashTableReset(HashJoinTablehashtable,longntuples);
28+
HeapTupletuple,
29+
uint32hashvalue);
30+
externuint32ExecHashGetHashValue(HashJoinTablehashtable,
31+
ExprContext*econtext,
32+
List*hashkeys);
33+
externvoidExecHashGetBucketAndBatch(HashJoinTablehashtable,
34+
uint32hashvalue,
35+
int*bucketno,
36+
int*batchno);
37+
externHeapTupleExecScanHashBucket(HashJoinState*hjstate,
38+
ExprContext*econtext);
39+
externvoidExecHashTableReset(HashJoinTablehashtable);
3740
externvoidExecChooseHashTableSize(doublentuples,inttupwidth,
38-
int*virtualbuckets,
39-
int*physicalbuckets,
41+
int*numbuckets,
4042
int*numbatches);
4143

4244
#endif/* NODEHASH_H */

‎src/include/executor/nodeHashjoin.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,29 @@
11
/*-------------------------------------------------------------------------
22
*
33
* nodeHashjoin.h
4-
*
4+
* prototypes for nodeHashjoin.c
55
*
66
*
77
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $PostgreSQL: pgsql/src/include/executor/nodeHashjoin.h,v 1.28 2004/12/31 22:03:29 pgsql Exp $
10+
* $PostgreSQL: pgsql/src/include/executor/nodeHashjoin.h,v 1.29 2005/03/06 22:15:05 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
1414
#ifndefNODEHASHJOIN_H
1515
#defineNODEHASHJOIN_H
1616

1717
#include"nodes/execnodes.h"
18+
#include"storage/buffile.h"
1819

1920
externintExecCountSlotsHashJoin(HashJoin*node);
2021
externHashJoinState*ExecInitHashJoin(HashJoin*node,EState*estate);
2122
externTupleTableSlot*ExecHashJoin(HashJoinState*node);
2223
externvoidExecEndHashJoin(HashJoinState*node);
2324
externvoidExecReScanHashJoin(HashJoinState*node,ExprContext*exprCtxt);
2425

25-
externvoidExecHashJoinSaveTuple(HeapTupleheapTuple,BufFile*file);
26+
externvoidExecHashJoinSaveTuple(HeapTupleheapTuple,uint32hashvalue,
27+
BufFile**fileptr);
2628

2729
#endif/* NODEHASHJOIN_H */

‎src/include/nodes/execnodes.h

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,14 @@
77
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.122 2004/12/31 22:03:34 pgsql Exp $
10+
* $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.123 2005/03/06 22:15:05 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
1414
#ifndefEXECNODES_H
1515
#defineEXECNODES_H
1616

1717
#include"access/relscan.h"
18-
#include"executor/hashjoin.h"
1918
#include"executor/tuptable.h"
2019
#include"fmgr.h"
2120
#include"nodes/bitmapset.h"
@@ -985,11 +984,13 @@ typedef struct MergeJoinState
985984
* HashJoinState information
986985
*
987986
*hj_HashTablehash table for the hashjoin
987+
*(NULL if table not built yet)
988+
*hj_CurHashValuehash value for current outer tuple
988989
*hj_CurBucketNobucket# for current outer tuple
989990
*hj_CurTuplelast inner tuple matched to current outer
990991
*tuple, or NULL if starting search
991-
*(CurBucketNo and CurTuple are meaningless
992-
*unlessOuterTupleSlot isnonempty!)
992+
*(CurHashValue,CurBucketNo and CurTuple are
993+
*undefined ifOuterTupleSlot isempty!)
993994
*hj_OuterHashKeysthe outer hash keys in the hashjoin condition
994995
*hj_InnerHashKeysthe inner hash keys in the hashjoin condition
995996
*hj_HashOperatorsthe join operators in the hashjoin condition
@@ -998,14 +999,19 @@ typedef struct MergeJoinState
998999
*hj_NullInnerTupleSlotprepared null tuple for left outer joins
9991000
*hj_NeedNewOutertrue if need new outer tuple on next call
10001001
*hj_MatchedOutertrue if found a join match for current outer
1001-
*hj_hashdonetrue if hash-table-build phase is done
10021002
* ----------------
10031003
*/
1004+
1005+
/* these structs are defined in executor/hashjoin.h: */
1006+
typedefstructHashJoinTupleData*HashJoinTuple;
1007+
typedefstructHashJoinTableData*HashJoinTable;
1008+
10041009
typedefstructHashJoinState
10051010
{
10061011
JoinStatejs;/* its first field is NodeTag */
10071012
List*hashclauses;/* list of ExprState nodes */
10081013
HashJoinTablehj_HashTable;
1014+
uint32hj_CurHashValue;
10091015
inthj_CurBucketNo;
10101016
HashJoinTuplehj_CurTuple;
10111017
List*hj_OuterHashKeys;/* list of ExprState nodes */
@@ -1016,7 +1022,6 @@ typedef struct HashJoinState
10161022
TupleTableSlot*hj_NullInnerTupleSlot;
10171023
boolhj_NeedNewOuter;
10181024
boolhj_MatchedOuter;
1019-
boolhj_hashdone;
10201025
}HashJoinState;
10211026

10221027

‎src/include/utils/selfuncs.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
99
* Portions Copyright (c) 1994, Regents of the University of California
1010
*
11-
* $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.21 2004/12/31 22:03:46 pgsql Exp $
11+
* $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.22 2005/03/06 22:15:05 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -121,7 +121,7 @@ extern double estimate_num_groups(Query *root, List *groupExprs,
121121
doubleinput_rows);
122122

123123
externSelectivityestimate_hash_bucketsize(Query*root,Node*hashkey,
124-
intnbuckets);
124+
doublenbuckets);
125125

126126
externDatumbtcostestimate(PG_FUNCTION_ARGS);
127127
externDatumrtcostestimate(PG_FUNCTION_ARGS);

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp