Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit73d1040

Browse files
committed
Fix eqjoinsel() to make use of new statistics.
1 parenta001f13 commit73d1040

File tree

1 file changed

+201
-41
lines changed

1 file changed

+201
-41
lines changed

‎src/backend/utils/adt/selfuncs.c

Lines changed: 201 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
*
1616
*
1717
* IDENTIFICATION
18-
* $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.90 2001/05/20 20:28:19 tgl Exp $
18+
* $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.91 2001/05/27 17:37:48 tgl Exp $
1919
*
2020
*-------------------------------------------------------------------------
2121
*/
@@ -940,9 +940,7 @@ Datum
940940
eqjoinsel(PG_FUNCTION_ARGS)
941941
{
942942
Query*root= (Query*)PG_GETARG_POINTER(0);
943-
#ifdefNOT_USED/* see neqjoinsel() before removing me! */
944943
Oidoperator=PG_GETARG_OID(1);
945-
#endif
946944
List*args= (List*)PG_GETARG_POINTER(2);
947945
Var*var1;
948946
Var*var2;
@@ -958,73 +956,219 @@ eqjoinsel(PG_FUNCTION_ARGS)
958956
HeapTuplestatsTuple2=NULL;
959957
Form_pg_statisticstats1=NULL;
960958
Form_pg_statisticstats2=NULL;
961-
doublend1,
962-
nd2;
963-
964-
if (var1==NULL)
965-
{
966-
nd1=DEFAULT_NUM_DISTINCT;
967-
}
968-
else
959+
doublend1=DEFAULT_NUM_DISTINCT;
960+
doublend2=DEFAULT_NUM_DISTINCT;
961+
boolhave_mcvs1= false;
962+
Datum*values1=NULL;
963+
intnvalues1=0;
964+
float4*numbers1=NULL;
965+
intnnumbers1=0;
966+
boolhave_mcvs2= false;
967+
Datum*values2=NULL;
968+
intnvalues2=0;
969+
float4*numbers2=NULL;
970+
intnnumbers2=0;
971+
972+
if (var1!=NULL)
969973
{
970974
/* get stats for the attribute, if available */
971975
Oidrelid1=getrelid(var1->varno,root->rtable);
972976

973-
if (relid1==InvalidOid)
974-
nd1=DEFAULT_NUM_DISTINCT;
975-
else
977+
if (relid1!=InvalidOid)
976978
{
977979
statsTuple1=SearchSysCache(STATRELATT,
978980
ObjectIdGetDatum(relid1),
979981
Int16GetDatum(var1->varattno),
980982
0,0);
981983
if (HeapTupleIsValid(statsTuple1))
984+
{
982985
stats1= (Form_pg_statistic)GETSTRUCT(statsTuple1);
986+
have_mcvs1=get_attstatsslot(statsTuple1,
987+
var1->vartype,
988+
var1->vartypmod,
989+
STATISTIC_KIND_MCV,
990+
InvalidOid,
991+
&values1,&nvalues1,
992+
&numbers1,&nnumbers1);
993+
}
983994

984995
nd1=get_att_numdistinct(root,var1,stats1);
985996
}
986997
}
987998

988-
if (var2==NULL)
989-
{
990-
nd2=DEFAULT_NUM_DISTINCT;
991-
}
992-
else
999+
if (var2!=NULL)
9931000
{
9941001
/* get stats for the attribute, if available */
9951002
Oidrelid2=getrelid(var2->varno,root->rtable);
9961003

997-
if (relid2==InvalidOid)
998-
nd2=DEFAULT_NUM_DISTINCT;
999-
else
1004+
if (relid2!=InvalidOid)
10001005
{
10011006
statsTuple2=SearchSysCache(STATRELATT,
10021007
ObjectIdGetDatum(relid2),
10031008
Int16GetDatum(var2->varattno),
10041009
0,0);
10051010
if (HeapTupleIsValid(statsTuple2))
1011+
{
10061012
stats2= (Form_pg_statistic)GETSTRUCT(statsTuple2);
1013+
have_mcvs2=get_attstatsslot(statsTuple2,
1014+
var2->vartype,
1015+
var2->vartypmod,
1016+
STATISTIC_KIND_MCV,
1017+
InvalidOid,
1018+
&values2,&nvalues2,
1019+
&numbers2,&nnumbers2);
1020+
}
10071021

10081022
nd2=get_att_numdistinct(root,var2,stats2);
10091023
}
10101024
}
10111025

1012-
/*
1013-
* Estimate the join selectivity as 1 / sqrt(nd1*nd2)
1014-
* (can we produce any theory for this)?
1015-
*
1016-
* XXX possibility to do better: if both attributes have histograms
1017-
* then we could determine the exact join selectivity between the
1018-
* MCV sets, and only have to assume the join behavior of the non-MCV
1019-
* values. This could be a big win when the MCVs cover a large part
1020-
* of the population.
1021-
*
1022-
* XXX what about nulls?
1023-
*/
1024-
selec=1.0 /sqrt(nd1*nd2);
1025-
if (selec>1.0)
1026-
selec=1.0;
1026+
if (have_mcvs1&&have_mcvs2)
1027+
{
1028+
/*
1029+
* We have most-common-value lists for both relations. Run
1030+
* through the lists to see which MCVs actually join to each
1031+
* other with the given operator. This allows us to determine
1032+
* the exact join selectivity for the portion of the relations
1033+
* represented by the MCV lists. We still have to estimate for
1034+
* the remaining population, but in a skewed distribution this
1035+
* gives us a big leg up in accuracy. For motivation see the
1036+
* analysis in Y. Ioannidis and S. Christodoulakis, "On the
1037+
* propagation of errors in the size of join results", Technical
1038+
* Report 1018, Computer Science Dept., University of Wisconsin,
1039+
* Madison, March 1991 (available from ftp.cs.wisc.edu).
1040+
*/
1041+
FmgrInfoeqproc;
1042+
bool*hasmatch1;
1043+
bool*hasmatch2;
1044+
doublematchprodfreq,
1045+
matchfreq1,
1046+
matchfreq2,
1047+
unmatchfreq1,
1048+
unmatchfreq2,
1049+
otherfreq1,
1050+
otherfreq2,
1051+
totalsel1,
1052+
totalsel2;
1053+
inti,
1054+
nmatches;
1055+
1056+
fmgr_info(get_opcode(operator),&eqproc);
1057+
hasmatch1= (bool*)palloc(nvalues1*sizeof(bool));
1058+
memset(hasmatch1,0,nvalues1*sizeof(bool));
1059+
hasmatch2= (bool*)palloc(nvalues2*sizeof(bool));
1060+
memset(hasmatch2,0,nvalues2*sizeof(bool));
1061+
/*
1062+
* Note we assume that each MCV will match at most one member of
1063+
* the other MCV list. If the operator isn't really equality,
1064+
* there could be multiple matches --- but we don't look for them,
1065+
* both for speed and because the math wouldn't add up...
1066+
*/
1067+
matchprodfreq=0.0;
1068+
nmatches=0;
1069+
for (i=0;i<nvalues1;i++)
1070+
{
1071+
intj;
10271072

1073+
for (j=0;j<nvalues2;j++)
1074+
{
1075+
if (hasmatch2[j])
1076+
continue;
1077+
if (DatumGetBool(FunctionCall2(&eqproc,
1078+
values1[i],
1079+
values2[j])))
1080+
{
1081+
hasmatch1[i]=hasmatch2[j]= true;
1082+
matchprodfreq+=numbers1[i]*numbers2[j];
1083+
nmatches++;
1084+
break;
1085+
}
1086+
}
1087+
}
1088+
/* Sum up frequencies of matched and unmatched MCVs */
1089+
matchfreq1=unmatchfreq1=0.0;
1090+
for (i=0;i<nvalues1;i++)
1091+
{
1092+
if (hasmatch1[i])
1093+
matchfreq1+=numbers1[i];
1094+
else
1095+
unmatchfreq1+=numbers1[i];
1096+
}
1097+
matchfreq2=unmatchfreq2=0.0;
1098+
for (i=0;i<nvalues2;i++)
1099+
{
1100+
if (hasmatch2[i])
1101+
matchfreq2+=numbers2[i];
1102+
else
1103+
unmatchfreq2+=numbers2[i];
1104+
}
1105+
pfree(hasmatch1);
1106+
pfree(hasmatch2);
1107+
/*
1108+
* Compute total frequency of non-null values that are not in
1109+
* the MCV lists.
1110+
*/
1111+
otherfreq1=1.0-stats1->stanullfrac-matchfreq1-unmatchfreq1;
1112+
otherfreq2=1.0-stats2->stanullfrac-matchfreq2-unmatchfreq2;
1113+
/*
1114+
* We can estimate the total selectivity from the point of view
1115+
* of relation 1 as: the known selectivity for matched MCVs, plus
1116+
* unmatched MCVs that are assumed to match against random members
1117+
* of relation 2's non-MCV population, plus non-MCV values that
1118+
* are assumed to match against random members of relation 2's
1119+
* unmatched MCVs plus non-MCV values.
1120+
*/
1121+
totalsel1=matchprodfreq;
1122+
if (nd2>nvalues2)
1123+
totalsel1+=unmatchfreq1*otherfreq2 / (nd2-nvalues2);
1124+
if (nd2>nmatches)
1125+
totalsel1+=otherfreq1* (otherfreq2+unmatchfreq2) /
1126+
(nd2-nmatches);
1127+
/* Same estimate from the point of view of relation 2. */
1128+
totalsel2=matchprodfreq;
1129+
if (nd1>nvalues1)
1130+
totalsel2+=unmatchfreq2*otherfreq1 / (nd1-nvalues1);
1131+
if (nd1>nmatches)
1132+
totalsel2+=otherfreq2* (otherfreq1+unmatchfreq1) /
1133+
(nd1-nmatches);
1134+
/*
1135+
* For robustness, we average the two estimates. (Can a case
1136+
* be made for taking the min or max instead?)
1137+
*/
1138+
selec= (totalsel1+totalsel2)*0.5;
1139+
}
1140+
else
1141+
{
1142+
/*
1143+
* We do not have MCV lists for both sides. Estimate the
1144+
* join selectivity as MIN(1/nd1, 1/nd2). This is plausible
1145+
* if we assume that the values are about equally distributed:
1146+
* a given tuple of rel1 will join to either 0 or N2/nd2 rows
1147+
* of rel2, so total join rows are at most N1*N2/nd2 giving
1148+
* a join selectivity of not more than 1/nd2. By the same logic
1149+
* it is not more than 1/nd1, so MIN(1/nd1, 1/nd2) is an upper
1150+
* bound. Using the MIN() means we estimate from the point of
1151+
* view of the relation with smaller nd (since the larger nd is
1152+
* determining the MIN). It is reasonable to assume that most
1153+
* tuples in this rel will have join partners, so the bound is
1154+
* probably reasonably tight and should be taken as-is.
1155+
*
1156+
* XXX Can we be smarter if we have an MCV list for just one side?
1157+
* It seems that if we assume equal distribution for the other
1158+
* side, we end up with the same answer anyway.
1159+
*/
1160+
if (nd1>nd2)
1161+
selec=1.0 /nd1;
1162+
else
1163+
selec=1.0 /nd2;
1164+
}
1165+
1166+
if (have_mcvs1)
1167+
free_attstatsslot(var1->vartype,values1,nvalues1,
1168+
numbers1,nnumbers1);
1169+
if (have_mcvs2)
1170+
free_attstatsslot(var2->vartype,values2,nvalues2,
1171+
numbers2,nnumbers2);
10281172
if (HeapTupleIsValid(statsTuple1))
10291173
ReleaseSysCache(statsTuple1);
10301174
if (HeapTupleIsValid(statsTuple2))
@@ -1039,14 +1183,30 @@ eqjoinsel(PG_FUNCTION_ARGS)
10391183
Datum
10401184
neqjoinsel(PG_FUNCTION_ARGS)
10411185
{
1186+
Query*root= (Query*)PG_GETARG_POINTER(0);
1187+
Oidoperator=PG_GETARG_OID(1);
1188+
List*args= (List*)PG_GETARG_POINTER(2);
1189+
Oideqop;
10421190
float8result;
10431191

10441192
/*
1045-
* XXX we skip looking up the negator operator here because we know
1046-
* eqjoinsel() won't look at it anyway. If eqjoinsel() ever does
1047-
* look, this routine will need to look more like neqsel() does.
1193+
* We want 1 - eqjoinsel() where the equality operator is the one
1194+
* associated with this != operator, that is, its negator.
10481195
*/
1049-
result=DatumGetFloat8(eqjoinsel(fcinfo));
1196+
eqop=get_negator(operator);
1197+
if (eqop)
1198+
{
1199+
result=DatumGetFloat8(DirectFunctionCall3(eqjoinsel,
1200+
PointerGetDatum(root),
1201+
ObjectIdGetDatum(eqop),
1202+
PointerGetDatum(args)));
1203+
1204+
}
1205+
else
1206+
{
1207+
/* Use default selectivity (should we raise an error instead?) */
1208+
result=DEFAULT_EQ_SEL;
1209+
}
10501210
result=1.0-result;
10511211
PG_RETURN_FLOAT8(result);
10521212
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp