Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit596efd2

Browse files
committed
Optimize multi-batch hash joins when the outer relation has a nonuniform
distribution, by creating a special fast path for the (first few) most commonvalues of the outer relation. Tuples having hashvalues matching the MCVsare effectively forced to be in the first batch, so that we never writethem out to the batch temp files.Bryce Cutt and Ramon Lawrence, with some editorialization by me.
1 parent249d936 commit596efd2

File tree

10 files changed

+604
-34
lines changed

10 files changed

+604
-34
lines changed

‎src/backend/executor/nodeHash.c

Lines changed: 435 additions & 10 deletions
Large diffs are not rendered by default.

‎src/backend/executor/nodeHashjoin.c

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.97 2009/01/01 17:23:41 momjian Exp $
11+
* $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.98 2009/03/21 00:04:38 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -198,19 +198,23 @@ ExecHashJoin(HashJoinState *node)
198198
node->hj_MatchedOuter= false;
199199

200200
/*
201-
*now we have an outer tuple, find the corresponding bucket for
202-
* this tuplefrom the hash table
201+
*Now we have an outer tuple; find the corresponding bucket for
202+
* this tuplein themainhash table or skew hash table.
203203
*/
204204
node->hj_CurHashValue=hashvalue;
205205
ExecHashGetBucketAndBatch(hashtable,hashvalue,
206206
&node->hj_CurBucketNo,&batchno);
207+
node->hj_CurSkewBucketNo=ExecHashGetSkewBucket(hashtable,
208+
hashvalue);
207209
node->hj_CurTuple=NULL;
208210

209211
/*
210212
* Now we've got an outer tuple and the corresponding hash bucket,
211-
* but this tuple may not belong to the current batch.
213+
* but it might not belong to the current batch, or it might
214+
* match a skew bucket.
212215
*/
213-
if (batchno!=hashtable->curbatch)
216+
if (batchno!=hashtable->curbatch&&
217+
node->hj_CurSkewBucketNo==INVALID_SKEW_BUCKET_NO)
214218
{
215219
/*
216220
* Need to postpone this outer tuple to a later batch. Save it
@@ -452,6 +456,7 @@ ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)
452456

453457
hjstate->hj_CurHashValue=0;
454458
hjstate->hj_CurBucketNo=0;
459+
hjstate->hj_CurSkewBucketNo=INVALID_SKEW_BUCKET_NO;
455460
hjstate->hj_CurTuple=NULL;
456461

457462
/*
@@ -651,6 +656,19 @@ ExecHashJoinNewBatch(HashJoinState *hjstate)
651656
BufFileClose(hashtable->outerBatchFile[curbatch]);
652657
hashtable->outerBatchFile[curbatch]=NULL;
653658
}
659+
else/* we just finished the first batch */
660+
{
661+
/*
662+
* Reset some of the skew optimization state variables, since we
663+
* no longer need to consider skew tuples after the first batch.
664+
* The memory context reset we are about to do will release the
665+
* skew hashtable itself.
666+
*/
667+
hashtable->skewEnabled= false;
668+
hashtable->skewBucket=NULL;
669+
hashtable->skewBucketNums=NULL;
670+
hashtable->spaceUsedSkew=0;
671+
}
654672

655673
/*
656674
* We can always skip over any batches that are completely empty on both
@@ -880,6 +898,7 @@ ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt)
880898
/* Always reset intra-tuple state */
881899
node->hj_CurHashValue=0;
882900
node->hj_CurBucketNo=0;
901+
node->hj_CurSkewBucketNo=INVALID_SKEW_BUCKET_NO;
883902
node->hj_CurTuple=NULL;
884903

885904
node->js.ps.ps_TupFromTlist= false;

‎src/backend/nodes/copyfuncs.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
* Portions Copyright (c) 1994, Regents of the University of California
1616
*
1717
* IDENTIFICATION
18-
* $PostgreSQL: pgsql/src/backend/nodes/copyfuncs.c,v 1.426 2009/03/10 22:09:25 tgl Exp $
18+
* $PostgreSQL: pgsql/src/backend/nodes/copyfuncs.c,v 1.427 2009/03/21 00:04:39 tgl Exp $
1919
*
2020
*-------------------------------------------------------------------------
2121
*/
@@ -735,6 +735,10 @@ _copyHash(Hash *from)
735735
/*
736736
* copy remainder of node
737737
*/
738+
COPY_SCALAR_FIELD(skewTable);
739+
COPY_SCALAR_FIELD(skewColumn);
740+
COPY_SCALAR_FIELD(skewColType);
741+
COPY_SCALAR_FIELD(skewColTypmod);
738742

739743
returnnewnode;
740744
}

‎src/backend/nodes/outfuncs.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.354 2009/03/10 22:09:25 tgl Exp $
11+
* $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.355 2009/03/21 00:04:39 tgl Exp $
1212
*
1313
* NOTES
1414
* Every node type that can appear in stored rules' parsetrees *must*
@@ -675,6 +675,11 @@ _outHash(StringInfo str, Hash *node)
675675
WRITE_NODE_TYPE("HASH");
676676

677677
_outPlanInfo(str, (Plan*)node);
678+
679+
WRITE_OID_FIELD(skewTable);
680+
WRITE_INT_FIELD(skewColumn);
681+
WRITE_OID_FIELD(skewColType);
682+
WRITE_INT_FIELD(skewColTypmod);
678683
}
679684

680685
staticvoid

‎src/backend/optimizer/path/costsize.c

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@
5454
* Portions Copyright (c) 1994, Regents of the University of California
5555
*
5656
* IDENTIFICATION
57-
* $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.204 2009/02/06 23:43:23 tgl Exp $
57+
* $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.205 2009/03/21 00:04:39 tgl Exp $
5858
*
5959
*-------------------------------------------------------------------------
6060
*/
@@ -1821,6 +1821,7 @@ cost_hashjoin(HashPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
18211821
intnum_hashclauses=list_length(hashclauses);
18221822
intnumbuckets;
18231823
intnumbatches;
1824+
intnum_skew_mcvs;
18241825
doublevirtualbuckets;
18251826
Selectivityinnerbucketsize;
18261827
ListCell*hcl;
@@ -1862,11 +1863,22 @@ cost_hashjoin(HashPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
18621863
*inner_path_rows;
18631864
run_cost+=cpu_operator_cost*num_hashclauses*outer_path_rows;
18641865

1865-
/* Get hash table size that executor would use for inner relation */
1866+
/*
1867+
* Get hash table size that executor would use for inner relation.
1868+
*
1869+
* XXX for the moment, always assume that skew optimization will be
1870+
* performed. As long as SKEW_WORK_MEM_PERCENT is small, it's not worth
1871+
* trying to determine that for sure.
1872+
*
1873+
* XXX at some point it might be interesting to try to account for skew
1874+
* optimization in the cost estimate, but for now, we don't.
1875+
*/
18661876
ExecChooseHashTableSize(inner_path_rows,
18671877
inner_path->parent->width,
1878+
true,/* useskew */
18681879
&numbuckets,
1869-
&numbatches);
1880+
&numbatches,
1881+
&num_skew_mcvs);
18701882
virtualbuckets= (double)numbuckets*(double)numbatches;
18711883

18721884
/*

‎src/backend/optimizer/plan/createplan.c

Lines changed: 58 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
*
1111
*
1212
* IDENTIFICATION
13-
* $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.255 2009/01/01 17:23:44 momjian Exp $
13+
* $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.256 2009/03/21 00:04:39 tgl Exp $
1414
*
1515
*-------------------------------------------------------------------------
1616
*/
@@ -112,7 +112,11 @@ static HashJoin *make_hashjoin(List *tlist,
112112
List*hashclauses,
113113
Plan*lefttree,Plan*righttree,
114114
JoinTypejointype);
115-
staticHash*make_hash(Plan*lefttree);
115+
staticHash*make_hash(Plan*lefttree,
116+
OidskewTable,
117+
AttrNumberskewColumn,
118+
OidskewColType,
119+
int32skewColTypmod);
116120
staticMergeJoin*make_mergejoin(List*tlist,
117121
List*joinclauses,List*otherclauses,
118122
List*mergeclauses,
@@ -1864,6 +1868,10 @@ create_hashjoin_plan(PlannerInfo *root,
18641868
List*joinclauses;
18651869
List*otherclauses;
18661870
List*hashclauses;
1871+
OidskewTable=InvalidOid;
1872+
AttrNumberskewColumn=InvalidAttrNumber;
1873+
OidskewColType=InvalidOid;
1874+
int32skewColTypmod=-1;
18671875
HashJoin*join_plan;
18681876
Hash*hash_plan;
18691877

@@ -1902,10 +1910,47 @@ create_hashjoin_plan(PlannerInfo *root,
19021910
/* We don't want any excess columns in the hashed tuples */
19031911
disuse_physical_tlist(inner_plan,best_path->jpath.innerjoinpath);
19041912

1913+
/*
1914+
* If there is a single join clause and we can identify the outer
1915+
* variable as a simple column reference, supply its identity for
1916+
* possible use in skew optimization. (Note: in principle we could
1917+
* do skew optimization with multiple join clauses, but we'd have to
1918+
* be able to determine the most common combinations of outer values,
1919+
* which we don't currently have enough stats for.)
1920+
*/
1921+
if (list_length(hashclauses)==1)
1922+
{
1923+
OpExpr*clause= (OpExpr*)linitial(hashclauses);
1924+
Node*node;
1925+
1926+
Assert(is_opclause(clause));
1927+
node= (Node*)linitial(clause->args);
1928+
if (IsA(node,RelabelType))
1929+
node= (Node*) ((RelabelType*)node)->arg;
1930+
if (IsA(node,Var))
1931+
{
1932+
Var*var= (Var*)node;
1933+
RangeTblEntry*rte;
1934+
1935+
rte=root->simple_rte_array[var->varno];
1936+
if (rte->rtekind==RTE_RELATION)
1937+
{
1938+
skewTable=rte->relid;
1939+
skewColumn=var->varattno;
1940+
skewColType=var->vartype;
1941+
skewColTypmod=var->vartypmod;
1942+
}
1943+
}
1944+
}
1945+
19051946
/*
19061947
* Build the hash node and hash join node.
19071948
*/
1908-
hash_plan=make_hash(inner_plan);
1949+
hash_plan=make_hash(inner_plan,
1950+
skewTable,
1951+
skewColumn,
1952+
skewColType,
1953+
skewColTypmod);
19091954
join_plan=make_hashjoin(tlist,
19101955
joinclauses,
19111956
otherclauses,
@@ -2713,7 +2758,11 @@ make_hashjoin(List *tlist,
27132758
}
27142759

27152760
staticHash*
2716-
make_hash(Plan*lefttree)
2761+
make_hash(Plan*lefttree,
2762+
OidskewTable,
2763+
AttrNumberskewColumn,
2764+
OidskewColType,
2765+
int32skewColTypmod)
27172766
{
27182767
Hash*node=makeNode(Hash);
27192768
Plan*plan=&node->plan;
@@ -2730,6 +2779,11 @@ make_hash(Plan *lefttree)
27302779
plan->lefttree=lefttree;
27312780
plan->righttree=NULL;
27322781

2782+
node->skewTable=skewTable;
2783+
node->skewColumn=skewColumn;
2784+
node->skewColType=skewColType;
2785+
node->skewColTypmod=skewColTypmod;
2786+
27332787
returnnode;
27342788
}
27352789

‎src/include/executor/hashjoin.h

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $PostgreSQL: pgsql/src/include/executor/hashjoin.h,v 1.49 2009/01/01 17:23:59 momjian Exp $
10+
* $PostgreSQL: pgsql/src/include/executor/hashjoin.h,v 1.50 2009/03/21 00:04:40 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -72,6 +72,36 @@ typedef struct HashJoinTupleData
7272
#defineHJTUPLE_MINTUPLE(hjtup) \
7373
((MinimalTuple) ((char *) (hjtup) + HJTUPLE_OVERHEAD))
7474

75+
/*
76+
* If the outer relation's distribution is sufficiently nonuniform, we attempt
77+
* to optimize the join by treating the hash values corresponding to the outer
78+
* relation's MCVs specially. Inner relation tuples matching these hash
79+
* values go into the "skew" hashtable instead of the main hashtable, and
80+
* outer relation tuples with these hash values are matched against that
81+
* table instead of the main one. Thus, tuples with these hash values are
82+
* effectively handled as part of the first batch and will never go to disk.
83+
* The skew hashtable is limited to SKEW_WORK_MEM_PERCENT of the total memory
84+
* allowed for the join; while building the hashtables, we decrease the number
85+
* of MCVs being specially treated if needed to stay under this limit.
86+
*
87+
* Note: you might wonder why we look at the outer relation stats for this,
88+
* rather than the inner. One reason is that the outer relation is typically
89+
* bigger, so we get more I/O savings by optimizing for its most common values.
90+
* Also, for similarly-sized relations, the planner prefers to put the more
91+
* uniformly distributed relation on the inside, so we're more likely to find
92+
* interesting skew in the outer relation.
93+
*/
94+
typedefstructHashSkewBucket
95+
{
96+
uint32hashvalue;/* common hash value */
97+
HashJoinTupletuples;/* linked list of inner-relation tuples */
98+
}HashSkewBucket;
99+
100+
#defineSKEW_BUCKET_OVERHEAD MAXALIGN(sizeof(HashSkewBucket))
101+
#defineINVALID_SKEW_BUCKET_NO (-1)
102+
#defineSKEW_WORK_MEM_PERCENT 2
103+
#defineSKEW_MIN_OUTER_FRACTION 0.01
104+
75105

76106
typedefstructHashJoinTableData
77107
{
@@ -82,6 +112,12 @@ typedef struct HashJoinTableData
82112
structHashJoinTupleData**buckets;
83113
/* buckets array is per-batch storage, as are all the tuples */
84114

115+
boolskewEnabled;/* are we using skew optimization? */
116+
HashSkewBucket**skewBucket;/* hashtable of skew buckets */
117+
intskewBucketLen;/* size of skewBucket array (a power of 2!) */
118+
intnSkewBuckets;/* number of active skew buckets */
119+
int*skewBucketNums;/* array indexes of active skew buckets */
120+
85121
intnbatch;/* number of batches */
86122
intcurbatch;/* current batch #; 0 during 1st pass */
87123

@@ -113,6 +149,8 @@ typedef struct HashJoinTableData
113149

114150
SizespaceUsed;/* memory space currently used by tuples */
115151
SizespaceAllowed;/* upper limit for space used */
152+
SizespaceUsedSkew;/* skew hash table's current space usage */
153+
SizespaceAllowedSkew;/* upper limit for skew hashtable */
116154

117155
MemoryContexthashCxt;/* context for whole-hash-join storage */
118156
MemoryContextbatchCxt;/* context for this-batch-only storage */

‎src/include/executor/nodeHash.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $PostgreSQL: pgsql/src/include/executor/nodeHash.h,v 1.46 2009/01/01 17:23:59 momjian Exp $
10+
* $PostgreSQL: pgsql/src/include/executor/nodeHash.h,v 1.47 2009/03/21 00:04:40 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -41,8 +41,10 @@ extern void ExecHashGetBucketAndBatch(HashJoinTable hashtable,
4141
externHashJoinTupleExecScanHashBucket(HashJoinState*hjstate,
4242
ExprContext*econtext);
4343
externvoidExecHashTableReset(HashJoinTablehashtable);
44-
externvoidExecChooseHashTableSize(doublentuples,inttupwidth,
44+
externvoidExecChooseHashTableSize(doublentuples,inttupwidth,booluseskew,
4545
int*numbuckets,
46-
int*numbatches);
46+
int*numbatches,
47+
int*num_skew_mcvs);
48+
externintExecHashGetSkewBucket(HashJoinTablehashtable,uint32hashvalue);
4749

4850
#endif/* NODEHASH_H */

‎src/include/nodes/execnodes.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.201 2009/01/12 05:10:45 tgl Exp $
10+
* $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.202 2009/03/21 00:04:40 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -1374,11 +1374,12 @@ typedef struct MergeJoinState
13741374
*hj_HashTablehash table for the hashjoin
13751375
*(NULL if table not built yet)
13761376
*hj_CurHashValuehash value for current outer tuple
1377-
*hj_CurBucketNobucket# for current outer tuple
1377+
*hj_CurBucketNoregular bucket# for current outer tuple
1378+
*hj_CurSkewBucketNoskew bucket# for current outer tuple
13781379
*hj_CurTuplelast inner tuple matched to current outer
13791380
*tuple, or NULL if starting search
1380-
*(CurHashValue, CurBucketNo and CurTuple are
1381-
* undefined ifOuterTupleSlot is empty!)
1381+
*(hj_CurXXX variables are undefined if
1382+
*OuterTupleSlot is empty!)
13821383
*hj_OuterHashKeysthe outer hash keys in the hashjoin condition
13831384
*hj_InnerHashKeysthe inner hash keys in the hashjoin condition
13841385
*hj_HashOperatorsthe join operators in the hashjoin condition
@@ -1403,6 +1404,7 @@ typedef struct HashJoinState
14031404
HashJoinTablehj_HashTable;
14041405
uint32hj_CurHashValue;
14051406
inthj_CurBucketNo;
1407+
inthj_CurSkewBucketNo;
14061408
HashJoinTuplehj_CurTuple;
14071409
List*hj_OuterHashKeys;/* list of ExprState nodes */
14081410
List*hj_InnerHashKeys;/* list of ExprState nodes */

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp