Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit0d3b231

Browse files
committed
Further repair of eqjoinsel ndistinct-clamping logic.
Examination of examples provided by Mark Kirkwood and others has convincedme that actually commit7f3eba3 was quitea few bricks shy of a load. The useful part of that patch was clampingndistinct for the inner side of a semi or anti join, and the reason whythat's needed is that it's the only way that restriction clauseseliminating rows from the inner relation can affect the estimated size ofthe join result. I had not clearly understood why the clamping wasappropriate, and so mis-extrapolated to conclude that we should clampndistinct for the outer side too, as well as for both sides of regularjoins. These latter actions were all wrong, and are reverted with thispatch. In addition, the clamping logic is now made to affect the behaviorof both paths in eqjoinsel_semi, with or without MCV lists to compare.When we have MCVs, we suppose that the most common values are the onesthat are most likely to survive the decimation resulting from a lowerrestriction clause, so we think of the clamping as eliminating non-MCVvalues, or potentially even the least-common MCVs for the inner relation.Back-patch to 8.4, same as previous fixes in this area.
1 parent7971a57 commit0d3b231

File tree

1 file changed

+50
-58
lines changed

1 file changed

+50
-58
lines changed

‎src/backend/utils/adt/selfuncs.c

Lines changed: 50 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -142,11 +142,10 @@ static double ineq_histogram_selectivity(PlannerInfo *root,
142142
FmgrInfo*opproc,boolisgt,
143143
Datumconstval,Oidconsttype);
144144
staticdoubleeqjoinsel_inner(Oidoperator,
145-
VariableStatData*vardata1,VariableStatData*vardata2,
146-
RelOptInfo*rel1,RelOptInfo*rel2);
145+
VariableStatData*vardata1,VariableStatData*vardata2);
147146
staticdoubleeqjoinsel_semi(Oidoperator,
148147
VariableStatData*vardata1,VariableStatData*vardata2,
149-
RelOptInfo*rel1,RelOptInfo*rel2);
148+
RelOptInfo*inner_rel);
150149
staticboolconvert_to_scalar(Datumvalue,Oidvaluetypid,double*scaledvalue,
151150
Datumlobound,Datumhibound,Oidboundstypid,
152151
double*scaledlobound,double*scaledhibound);
@@ -2011,47 +2010,35 @@ eqjoinsel(PG_FUNCTION_ARGS)
20112010
VariableStatDatavardata1;
20122011
VariableStatDatavardata2;
20132012
booljoin_is_reversed;
2014-
RelOptInfo*rel1;
2015-
RelOptInfo*rel2;
2013+
RelOptInfo*inner_rel;
20162014

20172015
get_join_variables(root,args,sjinfo,
20182016
&vardata1,&vardata2,&join_is_reversed);
20192017

2020-
/*
2021-
* Identify the join's direct input relations. We use the min lefthand
2022-
* and min righthand as the inputs, even though the join might actually
2023-
* get done with larger input relations. The min inputs are guaranteed to
2024-
* have been formed by now, though, and always using them ensures
2025-
* consistency of estimates.
2026-
*/
2027-
if (!join_is_reversed)
2028-
{
2029-
rel1=find_join_input_rel(root,sjinfo->min_lefthand);
2030-
rel2=find_join_input_rel(root,sjinfo->min_righthand);
2031-
}
2032-
else
2033-
{
2034-
rel1=find_join_input_rel(root,sjinfo->min_righthand);
2035-
rel2=find_join_input_rel(root,sjinfo->min_lefthand);
2036-
}
2037-
20382018
switch (sjinfo->jointype)
20392019
{
20402020
caseJOIN_INNER:
20412021
caseJOIN_LEFT:
20422022
caseJOIN_FULL:
2043-
selec=eqjoinsel_inner(operator,&vardata1,&vardata2,
2044-
rel1,rel2);
2023+
selec=eqjoinsel_inner(operator,&vardata1,&vardata2);
20452024
break;
20462025
caseJOIN_SEMI:
20472026
caseJOIN_ANTI:
2027+
/*
2028+
* Look up the join's inner relation. min_righthand is sufficient
2029+
* information because neither SEMI nor ANTI joins permit any
2030+
* reassociation into or out of their RHS, so the righthand will
2031+
* always be exactly that set of rels.
2032+
*/
2033+
inner_rel=find_join_input_rel(root,sjinfo->min_righthand);
2034+
20482035
if (!join_is_reversed)
20492036
selec=eqjoinsel_semi(operator,&vardata1,&vardata2,
2050-
rel1,rel2);
2037+
inner_rel);
20512038
else
20522039
selec=eqjoinsel_semi(get_commutator(operator),
20532040
&vardata2,&vardata1,
2054-
rel2,rel1);
2041+
inner_rel);
20552042
break;
20562043
default:
20572044
/* other values not expected here */
@@ -2077,8 +2064,7 @@ eqjoinsel(PG_FUNCTION_ARGS)
20772064
*/
20782065
staticdouble
20792066
eqjoinsel_inner(Oidoperator,
2080-
VariableStatData*vardata1,VariableStatData*vardata2,
2081-
RelOptInfo*rel1,RelOptInfo*rel2)
2067+
VariableStatData*vardata1,VariableStatData*vardata2)
20822068
{
20832069
doubleselec;
20842070
doublend1;
@@ -2273,26 +2259,10 @@ eqjoinsel_inner(Oid operator,
22732259
* XXX Can we be smarter if we have an MCV list for just one side? It
22742260
* seems that if we assume equal distribution for the other side, we
22752261
* end up with the same answer anyway.
2276-
*
2277-
* An additional hack we use here is to clamp the nd1 and nd2 values
2278-
* to not more than what we are estimating the input relation sizes to
2279-
* be, providing a crude correction for the selectivity of restriction
2280-
* clauses on those relations.(We don't do that in the other path
2281-
* since there we are comparing the nd values to stats for the whole
2282-
* relations.) We can apply this clamp both with respect to the base
2283-
* relations from which the join variables come, and to the immediate
2284-
* input relations of the current join.
22852262
*/
22862263
doublenullfrac1=stats1 ?stats1->stanullfrac :0.0;
22872264
doublenullfrac2=stats2 ?stats2->stanullfrac :0.0;
22882265

2289-
if (vardata1->rel)
2290-
nd1=Min(nd1,vardata1->rel->rows);
2291-
nd1=Min(nd1,rel1->rows);
2292-
if (vardata2->rel)
2293-
nd2=Min(nd2,vardata2->rel->rows);
2294-
nd2=Min(nd2,rel2->rows);
2295-
22962266
selec= (1.0-nullfrac1)* (1.0-nullfrac2);
22972267
if (nd1>nd2)
22982268
selec /=nd1;
@@ -2319,7 +2289,7 @@ eqjoinsel_inner(Oid operator,
23192289
staticdouble
23202290
eqjoinsel_semi(Oidoperator,
23212291
VariableStatData*vardata1,VariableStatData*vardata2,
2322-
RelOptInfo*rel1,RelOptInfo*rel2)
2292+
RelOptInfo*inner_rel)
23232293
{
23242294
doubleselec;
23252295
doublend1;
@@ -2339,6 +2309,25 @@ eqjoinsel_semi(Oid operator,
23392309
nd1=get_variable_numdistinct(vardata1);
23402310
nd2=get_variable_numdistinct(vardata2);
23412311

2312+
/*
2313+
* We clamp nd2 to be not more than what we estimate the inner relation's
2314+
* size to be. This is intuitively somewhat reasonable since obviously
2315+
* there can't be more than that many distinct values coming from the
2316+
* inner rel. The reason for the asymmetry (ie, that we don't clamp nd1
2317+
* likewise) is that this is the only pathway by which restriction clauses
2318+
* applied to the inner rel will affect the join result size estimate,
2319+
* since set_joinrel_size_estimates will multiply SEMI/ANTI selectivity by
2320+
* only the outer rel's size. If we clamped nd1 we'd be double-counting
2321+
* the selectivity of outer-rel restrictions.
2322+
*
2323+
* We can apply this clamping both with respect to the base relation from
2324+
* which the join variable comes (if there is just one), and to the
2325+
* immediate inner input relation of the current join.
2326+
*/
2327+
if (vardata2->rel)
2328+
nd2=Min(nd2,vardata2->rel->rows);
2329+
nd2=Min(nd2,inner_rel->rows);
2330+
23422331
if (HeapTupleIsValid(vardata1->statsTuple))
23432332
{
23442333
stats1= (Form_pg_statistic)GETSTRUCT(vardata1->statsTuple);
@@ -2382,11 +2371,21 @@ eqjoinsel_semi(Oid operator,
23822371
uncertainfrac,
23832372
uncertain;
23842373
inti,
2385-
nmatches;
2374+
nmatches,
2375+
clamped_nvalues2;
2376+
2377+
/*
2378+
* The clamping above could have resulted in nd2 being less than
2379+
* nvalues2; in which case, we assume that precisely the nd2 most
2380+
* common values in the relation will appear in the join input, and so
2381+
* compare to only the first nd2 members of the MCV list. Of course
2382+
* this is frequently wrong, but it's the best bet we can make.
2383+
*/
2384+
clamped_nvalues2=Min(nvalues2,nd2);
23862385

23872386
fmgr_info(get_opcode(operator),&eqproc);
23882387
hasmatch1= (bool*)palloc0(nvalues1*sizeof(bool));
2389-
hasmatch2= (bool*)palloc0(nvalues2*sizeof(bool));
2388+
hasmatch2= (bool*)palloc0(clamped_nvalues2*sizeof(bool));
23902389

23912390
/*
23922391
* Note we assume that each MCV will match at most one member of the
@@ -2399,7 +2398,7 @@ eqjoinsel_semi(Oid operator,
23992398
{
24002399
intj;
24012400

2402-
for (j=0;j<nvalues2;j++)
2401+
for (j=0;j<clamped_nvalues2;j++)
24032402
{
24042403
if (hasmatch2[j])
24052404
continue;
@@ -2444,7 +2443,7 @@ eqjoinsel_semi(Oid operator,
24442443
{
24452444
nd1-=nmatches;
24462445
nd2-=nmatches;
2447-
if (nd1 <=nd2||nd2 <=0)
2446+
if (nd1 <=nd2||nd2<0)
24482447
uncertainfrac=1.0;
24492448
else
24502449
uncertainfrac=nd2 /nd1;
@@ -2465,14 +2464,7 @@ eqjoinsel_semi(Oid operator,
24652464

24662465
if (nd1!=DEFAULT_NUM_DISTINCT&&nd2!=DEFAULT_NUM_DISTINCT)
24672466
{
2468-
if (vardata1->rel)
2469-
nd1=Min(nd1,vardata1->rel->rows);
2470-
nd1=Min(nd1,rel1->rows);
2471-
if (vardata2->rel)
2472-
nd2=Min(nd2,vardata2->rel->rows);
2473-
nd2=Min(nd2,rel2->rows);
2474-
2475-
if (nd1 <=nd2||nd2 <=0)
2467+
if (nd1 <=nd2||nd2<0)
24762468
selec=1.0-nullfrac1;
24772469
else
24782470
selec= (nd2 /nd1)* (1.0-nullfrac1);

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp