Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitb4b0079

Browse files
committed
Use extended statistics for precise estimation of bucket size in hash join.
Recognizing the real-life complexity where columns in the table often havefunctional dependencies, PostgreSQL's estimation of the number of distinctvalues over a set of columns can be underestimated (or much rarely,overestimated) when dealing with multi-clause JOIN.In the case of hash join, it can end up with a small number of predicted hashbuckets and, as a result, picking non-optimal merge join.To improve the situation, we introduce one additional stage of bucket sizeestimation - having two or more join clauses estimator lookup for extendedstatistics and use it for multicolumn estimation.Clauses are grouped into lists, each containing expressions referencing thesame relation. The result of the multicolumn estimation made over such a listis combined with others according to the caller's logic.Clauses that are not estimated are returned to the caller for furtherestimation.
1 parent65b71de commitb4b0079

File tree

3 files changed

+186
-1
lines changed

3 files changed

+186
-1
lines changed

‎src/backend/optimizer/path/costsize.c

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4250,9 +4250,19 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path,
42504250
}
42514251
else
42524252
{
4253+
List*otherclauses;
4254+
42534255
innerbucketsize=1.0;
42544256
innermcvfreq=1.0;
4255-
foreach(hcl,hashclauses)
4257+
4258+
/* At first, try to estimate bucket size using extended statistics. */
4259+
otherclauses=estimate_multivariate_bucketsize(root,
4260+
inner_path->parent,
4261+
hashclauses,
4262+
&innerbucketsize);
4263+
4264+
/* Pass through the remaining clauses */
4265+
foreach(hcl,otherclauses)
42564266
{
42574267
RestrictInfo*restrictinfo=lfirst_node(RestrictInfo,hcl);
42584268
Selectivitythisbucketsize;

‎src/backend/utils/adt/selfuncs.c

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3751,6 +3751,177 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
37513751
returnnumdistinct;
37523752
}
37533753

3754+
/*
3755+
* Try to estimate the bucket size of the hash join inner side when the join
3756+
* condition contains two or more clauses by employing extended statistics.
3757+
*
3758+
* The main idea of this approach is that the distinct value generated by
3759+
* multivariate estimation on two or more columns would provide less bucket size
3760+
* than estimation on one separate column.
3761+
*
3762+
* IMPORTANT: It is crucial to synchronise the approach of combining different
3763+
* estimations with the caller's method.
3764+
* Return a list of clauses that didn't fetch any extended statistics.
3765+
*/
3766+
List*
3767+
estimate_multivariate_bucketsize(PlannerInfo*root,RelOptInfo*inner,
3768+
List*hashclauses,
3769+
Selectivity*innerbucketsize)
3770+
{
3771+
List*clauses=list_copy(hashclauses);
3772+
List*otherclauses=NIL;
3773+
doublendistinct=1.0;
3774+
3775+
if (list_length(hashclauses) <=1)
3776+
/*
3777+
* Nothing to do for a single clause. Could we employ univariate
3778+
* extended stat here?
3779+
*/
3780+
returnhashclauses;
3781+
3782+
while (clauses!=NIL)
3783+
{
3784+
ListCell*lc;
3785+
intrelid=-1;
3786+
List*varinfos=NIL;
3787+
List*saveList=NIL;
3788+
doublemvndistinct;
3789+
List*origin_varinfos;
3790+
intgroup_relid=-1;
3791+
RelOptInfo*group_rel;
3792+
ListCell*lc1,*lc2;
3793+
3794+
/*
3795+
* Find clauses, referencing the same single base relation and try to
3796+
* estimate such a group with extended statistics.
3797+
* Create varinfo for an approved clause, push it to otherclauses, if it
3798+
* can't be estimated here or ignore to process at the next iteration.
3799+
*/
3800+
foreach(lc,clauses)
3801+
{
3802+
RestrictInfo*rinfo=lfirst_node(RestrictInfo,lc);
3803+
Node*expr;
3804+
Relidsrelids;
3805+
GroupVarInfo*varinfo;
3806+
3807+
/*
3808+
* Find the inner side of the join which we need to estimate the
3809+
* number of buckets. Use outer_is_left because the
3810+
* clause_sides_match_join routine has called on hash clauses.
3811+
*/
3812+
relids=rinfo->outer_is_left ?
3813+
rinfo->right_relids :rinfo->left_relids;
3814+
expr=rinfo->outer_is_left ?
3815+
get_rightop(rinfo->clause) :get_leftop(rinfo->clause);
3816+
3817+
if (bms_get_singleton_member(relids,&relid)&&
3818+
root->simple_rel_array[relid]->statlist!=NIL)
3819+
{
3820+
/*
3821+
* This inner-side expression references only one relation.
3822+
* Extended statistics on this clause can exist.
3823+
*/
3824+
3825+
if (group_relid<0)
3826+
{
3827+
RangeTblEntry*rte=root->simple_rte_array[relid];
3828+
3829+
if (!rte|| (rte->relkind!=RELKIND_RELATION&&
3830+
rte->relkind!=RELKIND_MATVIEW&&
3831+
rte->relkind!=RELKIND_FOREIGN_TABLE&&
3832+
rte->relkind!=RELKIND_PARTITIONED_TABLE))
3833+
{
3834+
/* Extended statistics can't exist in principle */
3835+
otherclauses=lappend(otherclauses,rinfo);
3836+
clauses=foreach_delete_current(clauses,lc);
3837+
continue;
3838+
}
3839+
3840+
group_relid=relid;
3841+
group_rel=root->simple_rel_array[relid];
3842+
Assert(group_rel!=NULL);
3843+
}
3844+
elseif (group_relid!=relid)
3845+
/*
3846+
* Being in the group forming state we don't
3847+
* need other clauses.
3848+
*/
3849+
continue;
3850+
3851+
varinfo= (GroupVarInfo*)palloc(sizeof(GroupVarInfo));
3852+
varinfo->var=expr;
3853+
varinfo->rel=root->simple_rel_array[relid];
3854+
varinfo->ndistinct=0.0;
3855+
varinfo->isdefault= false;
3856+
varinfos=lappend(varinfos,varinfo);
3857+
3858+
/*
3859+
* Remember the link to RestrictInfo for the case the clause is
3860+
* failed to be estimated.
3861+
*/
3862+
saveList=lappend(saveList,rinfo);
3863+
}
3864+
else
3865+
/* This clause can't be estimated with extended statistics */
3866+
otherclauses=lappend(otherclauses,rinfo);
3867+
3868+
clauses=foreach_delete_current(clauses,lc);
3869+
}
3870+
3871+
if (list_length(varinfos)<2)
3872+
{
3873+
/*
3874+
* Multivariate statistics doesn't apply to single column except
3875+
* expressions, but it has not implemented yet.
3876+
*/
3877+
otherclauses=list_concat(otherclauses,saveList);
3878+
list_free_deep(varinfos);
3879+
list_free(saveList);
3880+
continue;
3881+
}
3882+
3883+
/* Let's employ extended statistics. */
3884+
origin_varinfos=varinfos;
3885+
for (;;)
3886+
{
3887+
boolestimated=estimate_multivariate_ndistinct(root,
3888+
group_rel,
3889+
&varinfos,
3890+
&mvndistinct);
3891+
if (!estimated)
3892+
break;
3893+
3894+
/*
3895+
* We've got an estimation. Use ndistinct value in consistent way -
3896+
* according to the caller's logic (see final_cost_hashjoin).
3897+
*/
3898+
if (ndistinct<mvndistinct)
3899+
ndistinct=mvndistinct;
3900+
Assert(ndistinct >=1.0);
3901+
}
3902+
3903+
Assert(list_length(origin_varinfos)==list_length(saveList));
3904+
3905+
/*
3906+
* Remove already estimated clauses from the saveList
3907+
*/
3908+
forboth(lc1,origin_varinfos,lc2,saveList)
3909+
{
3910+
GroupVarInfo*vinfo=lfirst(lc1);
3911+
3912+
if (!list_member_ptr(varinfos,vinfo))
3913+
/* Already estimated */
3914+
continue;
3915+
3916+
/* Can't be estimated here - push to the returning list */
3917+
otherclauses=lappend(otherclauses,lfirst(lc2));
3918+
}
3919+
}
3920+
3921+
*innerbucketsize=1.0 /ndistinct;
3922+
returnotherclauses;
3923+
}
3924+
37543925
/*
37553926
* Estimate hash bucket statistics when the specified expression is used
37563927
* as a hash key for the given number of buckets.

‎src/include/utils/selfuncs.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,10 @@ extern double estimate_num_groups(PlannerInfo *root, List *groupExprs,
217217
doubleinput_rows,List**pgset,
218218
EstimationInfo*estinfo);
219219

220+
externList*estimate_multivariate_bucketsize(PlannerInfo*root,
221+
RelOptInfo*inner,
222+
List*hashclauses,
223+
Selectivity*innerbucketsize);
220224
externvoidestimate_hash_bucket_stats(PlannerInfo*root,
221225
Node*hashkey,doublenbuckets,
222226
Selectivity*mcv_freq,

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp