NotificationsYou must be signed in to change notification settings
Fork6
Star31

Commit73d1040

committed

Fix eqjoinsel() to make use of new statistics.

1 parenta001f13 commit73d1040Copy full SHA for 73d1040

File tree

1 file changed

+201

-41

lines changed

src/backend/utils/adt
- selfuncs.c

1 file changed

+201

-41

lines changed

`‎src/backend/utils/adt/selfuncs.c`

Lines changed: 201 additions & 41 deletions

Original file line number	Diff line number	Diff line change
`@@ -15,7 +15,7 @@`
`15`	`15`	`*`
`16`	`16`	`*`
`17`	`17`	`* IDENTIFICATION`
`18`		`- * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.90 2001/05/20 20:28:19 tgl Exp $`
	`18`	`+ * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.91 2001/05/27 17:37:48 tgl Exp $`
`19`	`19`	`*`
`20`	`20`	`*-------------------------------------------------------------------------`
`21`	`21`	`*/`
`@@ -940,9 +940,7 @@ Datum`
`940`	`940`	`eqjoinsel(PG_FUNCTION_ARGS)`
`941`	`941`	`{`
`942`	`942`	`Queryroot= (Query)PG_GETARG_POINTER(0);`
`943`		`-#ifdefNOT_USED/* see neqjoinsel() before removing me! */`
`944`	`943`	`Oidoperator=PG_GETARG_OID(1);`
`945`		`-#endif`
`946`	`944`	`Listargs= (List)PG_GETARG_POINTER(2);`
`947`	`945`	`Var*var1;`
`948`	`946`	`Var*var2;`
`@@ -958,73 +956,219 @@ eqjoinsel(PG_FUNCTION_ARGS)`
`958`	`956`	`HeapTuplestatsTuple2=NULL;`
`959`	`957`	`Form_pg_statisticstats1=NULL;`
`960`	`958`	`Form_pg_statisticstats2=NULL;`
`961`		`-doublend1,`
`962`		`-nd2;`
`963`		`-`
`964`		`-if (var1==NULL)`
`965`		`-{`
`966`		`-nd1=DEFAULT_NUM_DISTINCT;`
`967`		`-}`
`968`		`-else`
	`959`	`+doublend1=DEFAULT_NUM_DISTINCT;`
	`960`	`+doublend2=DEFAULT_NUM_DISTINCT;`
	`961`	`+boolhave_mcvs1= false;`
	`962`	`+Datum*values1=NULL;`
	`963`	`+intnvalues1=0;`
	`964`	`+float4*numbers1=NULL;`
	`965`	`+intnnumbers1=0;`
	`966`	`+boolhave_mcvs2= false;`
	`967`	`+Datum*values2=NULL;`
	`968`	`+intnvalues2=0;`
	`969`	`+float4*numbers2=NULL;`
	`970`	`+intnnumbers2=0;`
	`971`	`+`
	`972`	`+if (var1!=NULL)`
`969`	`973`	`{`
`970`	`974`	`/* get stats for the attribute, if available */`
`971`	`975`	`Oidrelid1=getrelid(var1->varno,root->rtable);`
`972`	`976`
`973`		`-if (relid1==InvalidOid)`
`974`		`-nd1=DEFAULT_NUM_DISTINCT;`
`975`		`-else`
	`977`	`+if (relid1!=InvalidOid)`
`976`	`978`	`{`
`977`	`979`	`statsTuple1=SearchSysCache(STATRELATT,`
`978`	`980`	`ObjectIdGetDatum(relid1),`
`979`	`981`	`Int16GetDatum(var1->varattno),`
`980`	`982`	`0,0);`
`981`	`983`	`if (HeapTupleIsValid(statsTuple1))`
	`984`	`+{`
`982`	`985`	`stats1= (Form_pg_statistic)GETSTRUCT(statsTuple1);`
	`986`	`+have_mcvs1=get_attstatsslot(statsTuple1,`
	`987`	`+var1->vartype,`
	`988`	`+var1->vartypmod,`
	`989`	`+STATISTIC_KIND_MCV,`
	`990`	`+InvalidOid,`
	`991`	`+&values1,&nvalues1,`
	`992`	`+&numbers1,&nnumbers1);`
	`993`	`+}`
`983`	`994`
`984`	`995`	`nd1=get_att_numdistinct(root,var1,stats1);`
`985`	`996`	`}`
`986`	`997`	`}`
`987`	`998`
`988`		`-if (var2==NULL)`
`989`		`-{`
`990`		`-nd2=DEFAULT_NUM_DISTINCT;`
`991`		`-}`
`992`		`-else`
	`999`	`+if (var2!=NULL)`
`993`	`1000`	`{`
`994`	`1001`	`/* get stats for the attribute, if available */`
`995`	`1002`	`Oidrelid2=getrelid(var2->varno,root->rtable);`
`996`	`1003`
`997`		`-if (relid2==InvalidOid)`
`998`		`-nd2=DEFAULT_NUM_DISTINCT;`
`999`		`-else`
	`1004`	`+if (relid2!=InvalidOid)`
`1000`	`1005`	`{`
`1001`	`1006`	`statsTuple2=SearchSysCache(STATRELATT,`
`1002`	`1007`	`ObjectIdGetDatum(relid2),`
`1003`	`1008`	`Int16GetDatum(var2->varattno),`
`1004`	`1009`	`0,0);`
`1005`	`1010`	`if (HeapTupleIsValid(statsTuple2))`
	`1011`	`+{`
`1006`	`1012`	`stats2= (Form_pg_statistic)GETSTRUCT(statsTuple2);`
	`1013`	`+have_mcvs2=get_attstatsslot(statsTuple2,`
	`1014`	`+var2->vartype,`
	`1015`	`+var2->vartypmod,`
	`1016`	`+STATISTIC_KIND_MCV,`
	`1017`	`+InvalidOid,`
	`1018`	`+&values2,&nvalues2,`
	`1019`	`+&numbers2,&nnumbers2);`
	`1020`	`+}`
`1007`	`1021`
`1008`	`1022`	`nd2=get_att_numdistinct(root,var2,stats2);`
`1009`	`1023`	`}`
`1010`	`1024`	`}`
`1011`	`1025`
`1012`		`-/*`
`1013`		`- * Estimate the join selectivity as 1 / sqrt(nd1*nd2)`
`1014`		`- * (can we produce any theory for this)?`
`1015`		`- *`
`1016`		`- * XXX possibility to do better: if both attributes have histograms`
`1017`		`- * then we could determine the exact join selectivity between the`
`1018`		`- * MCV sets, and only have to assume the join behavior of the non-MCV`
`1019`		`- * values. This could be a big win when the MCVs cover a large part`
`1020`		`- * of the population.`
`1021`		`- *`
`1022`		`- * XXX what about nulls?`
`1023`		`- */`
`1024`		`-selec=1.0 /sqrt(nd1*nd2);`
`1025`		`-if (selec>1.0)`
`1026`		`-selec=1.0;`
	`1026`	`+if (have_mcvs1&&have_mcvs2)`
	`1027`	`+{`
	`1028`	`+/*`
	`1029`	`+ * We have most-common-value lists for both relations. Run`
	`1030`	`+ * through the lists to see which MCVs actually join to each`
	`1031`	`+ * other with the given operator. This allows us to determine`
	`1032`	`+ * the exact join selectivity for the portion of the relations`
	`1033`	`+ * represented by the MCV lists. We still have to estimate for`
	`1034`	`+ * the remaining population, but in a skewed distribution this`
	`1035`	`+ * gives us a big leg up in accuracy. For motivation see the`
	`1036`	`+ * analysis in Y. Ioannidis and S. Christodoulakis, "On the`
	`1037`	`+ * propagation of errors in the size of join results", Technical`
	`1038`	`+ * Report 1018, Computer Science Dept., University of Wisconsin,`
	`1039`	`+ * Madison, March 1991 (available from ftp.cs.wisc.edu).`
	`1040`	`+ */`
	`1041`	`+FmgrInfoeqproc;`
	`1042`	`+bool*hasmatch1;`
	`1043`	`+bool*hasmatch2;`
	`1044`	`+doublematchprodfreq,`
	`1045`	`+matchfreq1,`
	`1046`	`+matchfreq2,`
	`1047`	`+unmatchfreq1,`
	`1048`	`+unmatchfreq2,`
	`1049`	`+otherfreq1,`
	`1050`	`+otherfreq2,`
	`1051`	`+totalsel1,`
	`1052`	`+totalsel2;`
	`1053`	`+inti,`
	`1054`	`+nmatches;`
	`1055`	`+`
	`1056`	`+fmgr_info(get_opcode(operator),&eqproc);`
	`1057`	`+hasmatch1= (bool)palloc(nvalues1sizeof(bool));`
	`1058`	`+memset(hasmatch1,0,nvalues1*sizeof(bool));`
	`1059`	`+hasmatch2= (bool)palloc(nvalues2sizeof(bool));`
	`1060`	`+memset(hasmatch2,0,nvalues2*sizeof(bool));`
	`1061`	`+/*`
	`1062`	`+ * Note we assume that each MCV will match at most one member of`
	`1063`	`+ * the other MCV list. If the operator isn't really equality,`
	`1064`	`+ * there could be multiple matches --- but we don't look for them,`
	`1065`	`+ * both for speed and because the math wouldn't add up...`
	`1066`	`+ */`
	`1067`	`+matchprodfreq=0.0;`
	`1068`	`+nmatches=0;`
	`1069`	`+for (i=0;i<nvalues1;i++)`
	`1070`	`+{`
	`1071`	`+intj;`
`1027`	`1072`
	`1073`	`+for (j=0;j<nvalues2;j++)`
	`1074`	`+{`
	`1075`	`+if (hasmatch2[j])`
	`1076`	`+continue;`
	`1077`	`+if (DatumGetBool(FunctionCall2(&eqproc,`
	`1078`	`+values1[i],`
	`1079`	`+values2[j])))`
	`1080`	`+{`
	`1081`	`+hasmatch1[i]=hasmatch2[j]= true;`
	`1082`	`+matchprodfreq+=numbers1[i]*numbers2[j];`
	`1083`	`+nmatches++;`
	`1084`	`+break;`
	`1085`	`+}`
	`1086`	`+}`
	`1087`	`+}`
	`1088`	`+/* Sum up frequencies of matched and unmatched MCVs */`
	`1089`	`+matchfreq1=unmatchfreq1=0.0;`
	`1090`	`+for (i=0;i<nvalues1;i++)`
	`1091`	`+{`
	`1092`	`+if (hasmatch1[i])`
	`1093`	`+matchfreq1+=numbers1[i];`
	`1094`	`+else`
	`1095`	`+unmatchfreq1+=numbers1[i];`
	`1096`	`+}`
	`1097`	`+matchfreq2=unmatchfreq2=0.0;`
	`1098`	`+for (i=0;i<nvalues2;i++)`
	`1099`	`+{`
	`1100`	`+if (hasmatch2[i])`
	`1101`	`+matchfreq2+=numbers2[i];`
	`1102`	`+else`
	`1103`	`+unmatchfreq2+=numbers2[i];`
	`1104`	`+}`
	`1105`	`+pfree(hasmatch1);`
	`1106`	`+pfree(hasmatch2);`
	`1107`	`+/*`
	`1108`	`+ * Compute total frequency of non-null values that are not in`
	`1109`	`+ * the MCV lists.`
	`1110`	`+ */`
	`1111`	`+otherfreq1=1.0-stats1->stanullfrac-matchfreq1-unmatchfreq1;`
	`1112`	`+otherfreq2=1.0-stats2->stanullfrac-matchfreq2-unmatchfreq2;`
	`1113`	`+/*`
	`1114`	`+ * We can estimate the total selectivity from the point of view`
	`1115`	`+ * of relation 1 as: the known selectivity for matched MCVs, plus`
	`1116`	`+ * unmatched MCVs that are assumed to match against random members`
	`1117`	`+ * of relation 2's non-MCV population, plus non-MCV values that`
	`1118`	`+ * are assumed to match against random members of relation 2's`
	`1119`	`+ * unmatched MCVs plus non-MCV values.`
	`1120`	`+ */`
	`1121`	`+totalsel1=matchprodfreq;`
	`1122`	`+if (nd2>nvalues2)`
	`1123`	`+totalsel1+=unmatchfreq1*otherfreq2 / (nd2-nvalues2);`
	`1124`	`+if (nd2>nmatches)`
	`1125`	`+totalsel1+=otherfreq1* (otherfreq2+unmatchfreq2) /`
	`1126`	`+(nd2-nmatches);`
	`1127`	`+/* Same estimate from the point of view of relation 2. */`
	`1128`	`+totalsel2=matchprodfreq;`
	`1129`	`+if (nd1>nvalues1)`
	`1130`	`+totalsel2+=unmatchfreq2*otherfreq1 / (nd1-nvalues1);`
	`1131`	`+if (nd1>nmatches)`
	`1132`	`+totalsel2+=otherfreq2* (otherfreq1+unmatchfreq1) /`
	`1133`	`+(nd1-nmatches);`
	`1134`	`+/*`
	`1135`	`+ * For robustness, we average the two estimates. (Can a case`
	`1136`	`+ * be made for taking the min or max instead?)`
	`1137`	`+ */`
	`1138`	`+selec= (totalsel1+totalsel2)*0.5;`
	`1139`	`+}`
	`1140`	`+else`
	`1141`	`+{`
	`1142`	`+/*`
	`1143`	`+ * We do not have MCV lists for both sides. Estimate the`
	`1144`	`+ * join selectivity as MIN(1/nd1, 1/nd2). This is plausible`
	`1145`	`+ * if we assume that the values are about equally distributed:`
	`1146`	`+ * a given tuple of rel1 will join to either 0 or N2/nd2 rows`
	`1147`	`+ * of rel2, so total join rows are at most N1*N2/nd2 giving`
	`1148`	`+ * a join selectivity of not more than 1/nd2. By the same logic`
	`1149`	`+ * it is not more than 1/nd1, so MIN(1/nd1, 1/nd2) is an upper`
	`1150`	`+ * bound. Using the MIN() means we estimate from the point of`
	`1151`	`+ * view of the relation with smaller nd (since the larger nd is`
	`1152`	`+ * determining the MIN). It is reasonable to assume that most`
	`1153`	`+ * tuples in this rel will have join partners, so the bound is`
	`1154`	`+ * probably reasonably tight and should be taken as-is.`
	`1155`	`+ *`
	`1156`	`+ * XXX Can we be smarter if we have an MCV list for just one side?`
	`1157`	`+ * It seems that if we assume equal distribution for the other`
	`1158`	`+ * side, we end up with the same answer anyway.`
	`1159`	`+ */`
	`1160`	`+if (nd1>nd2)`
	`1161`	`+selec=1.0 /nd1;`
	`1162`	`+else`
	`1163`	`+selec=1.0 /nd2;`
	`1164`	`+}`
	`1165`	`+`
	`1166`	`+if (have_mcvs1)`
	`1167`	`+free_attstatsslot(var1->vartype,values1,nvalues1,`
	`1168`	`+numbers1,nnumbers1);`
	`1169`	`+if (have_mcvs2)`
	`1170`	`+free_attstatsslot(var2->vartype,values2,nvalues2,`
	`1171`	`+numbers2,nnumbers2);`
`1028`	`1172`	`if (HeapTupleIsValid(statsTuple1))`
`1029`	`1173`	`ReleaseSysCache(statsTuple1);`
`1030`	`1174`	`if (HeapTupleIsValid(statsTuple2))`
`@@ -1039,14 +1183,30 @@ eqjoinsel(PG_FUNCTION_ARGS)`
`1039`	`1183`	`Datum`
`1040`	`1184`	`neqjoinsel(PG_FUNCTION_ARGS)`
`1041`	`1185`	`{`
	`1186`	`+Queryroot= (Query)PG_GETARG_POINTER(0);`
	`1187`	`+Oidoperator=PG_GETARG_OID(1);`
	`1188`	`+Listargs= (List)PG_GETARG_POINTER(2);`
	`1189`	`+Oideqop;`
`1042`	`1190`	`float8result;`
`1043`	`1191`
`1044`	`1192`	`/*`
`1045`		`- * XXX we skip looking up the negator operator here because we know`
`1046`		`- * eqjoinsel() won't look at it anyway. If eqjoinsel() ever does`
`1047`		`- * look, this routine will need to look more like neqsel() does.`
	`1193`	`+ * We want 1 - eqjoinsel() where the equality operator is the one`
	`1194`	`+ * associated with this != operator, that is, its negator.`
`1048`	`1195`	`*/`
`1049`		`-result=DatumGetFloat8(eqjoinsel(fcinfo));`
	`1196`	`+eqop=get_negator(operator);`
	`1197`	`+if (eqop)`
	`1198`	`+{`
	`1199`	`+result=DatumGetFloat8(DirectFunctionCall3(eqjoinsel,`
	`1200`	`+PointerGetDatum(root),`
	`1201`	`+ObjectIdGetDatum(eqop),`
	`1202`	`+PointerGetDatum(args)));`
	`1203`	`+`
	`1204`	`+}`
	`1205`	`+else`
	`1206`	`+{`
	`1207`	`+/* Use default selectivity (should we raise an error instead?) */`
	`1208`	`+result=DEFAULT_EQ_SEL;`
	`1209`	`+}`
`1050`	`1210`	`result=1.0-result;`
`1051`	`1211`	`PG_RETURN_FLOAT8(result);`
`1052`	`1212`	`}`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit73d1040

File tree

1 file changed

1 file changed

`‎src/backend/utils/adt/selfuncs.c`

0 commit comments