@@ -141,11 +141,10 @@ static double ineq_histogram_selectivity(PlannerInfo *root,
141141FmgrInfo * opproc ,bool isgt ,
142142Datum constval ,Oid consttype );
143143static double eqjoinsel_inner (Oid operator ,
144- VariableStatData * vardata1 ,VariableStatData * vardata2 ,
145- RelOptInfo * rel1 ,RelOptInfo * rel2 );
144+ VariableStatData * vardata1 ,VariableStatData * vardata2 );
146145static double eqjoinsel_semi (Oid operator ,
147146VariableStatData * vardata1 ,VariableStatData * vardata2 ,
148- RelOptInfo * rel1 , RelOptInfo * rel2 );
147+ RelOptInfo * inner_rel );
149148static bool convert_to_scalar (Datum value ,Oid valuetypid ,double * scaledvalue ,
150149Datum lobound ,Datum hibound ,Oid boundstypid ,
151150double * scaledlobound ,double * scaledhibound );
@@ -2010,47 +2009,35 @@ eqjoinsel(PG_FUNCTION_ARGS)
20102009VariableStatData vardata1 ;
20112010VariableStatData vardata2 ;
20122011bool join_is_reversed ;
2013- RelOptInfo * rel1 ;
2014- RelOptInfo * rel2 ;
2012+ RelOptInfo * inner_rel ;
20152013
20162014get_join_variables (root ,args ,sjinfo ,
20172015& vardata1 ,& vardata2 ,& join_is_reversed );
20182016
2019- /*
2020- * Identify the join's direct input relations. We use the min lefthand
2021- * and min righthand as the inputs, even though the join might actually
2022- * get done with larger input relations. The min inputs are guaranteed to
2023- * have been formed by now, though, and always using them ensures
2024- * consistency of estimates.
2025- */
2026- if (!join_is_reversed )
2027- {
2028- rel1 = find_join_input_rel (root ,sjinfo -> min_lefthand );
2029- rel2 = find_join_input_rel (root ,sjinfo -> min_righthand );
2030- }
2031- else
2032- {
2033- rel1 = find_join_input_rel (root ,sjinfo -> min_righthand );
2034- rel2 = find_join_input_rel (root ,sjinfo -> min_lefthand );
2035- }
2036-
20372017switch (sjinfo -> jointype )
20382018{
20392019case JOIN_INNER :
20402020case JOIN_LEFT :
20412021case JOIN_FULL :
2042- selec = eqjoinsel_inner (operator ,& vardata1 ,& vardata2 ,
2043- rel1 ,rel2 );
2022+ selec = eqjoinsel_inner (operator ,& vardata1 ,& vardata2 );
20442023break ;
20452024case JOIN_SEMI :
20462025case JOIN_ANTI :
2026+ /*
2027+ * Look up the join's inner relation. min_righthand is sufficient
2028+ * information because neither SEMI nor ANTI joins permit any
2029+ * reassociation into or out of their RHS, so the righthand will
2030+ * always be exactly that set of rels.
2031+ */
2032+ inner_rel = find_join_input_rel (root ,sjinfo -> min_righthand );
2033+
20472034if (!join_is_reversed )
20482035selec = eqjoinsel_semi (operator ,& vardata1 ,& vardata2 ,
2049- rel1 , rel2 );
2036+ inner_rel );
20502037else
20512038selec = eqjoinsel_semi (get_commutator (operator ),
20522039& vardata2 ,& vardata1 ,
2053- rel2 , rel1 );
2040+ inner_rel );
20542041break ;
20552042default :
20562043/* other values not expected here */
@@ -2076,8 +2063,7 @@ eqjoinsel(PG_FUNCTION_ARGS)
20762063 */
20772064static double
20782065eqjoinsel_inner (Oid operator ,
2079- VariableStatData * vardata1 ,VariableStatData * vardata2 ,
2080- RelOptInfo * rel1 ,RelOptInfo * rel2 )
2066+ VariableStatData * vardata1 ,VariableStatData * vardata2 )
20812067{
20822068double selec ;
20832069double nd1 ;
@@ -2272,26 +2258,10 @@ eqjoinsel_inner(Oid operator,
22722258 * XXX Can we be smarter if we have an MCV list for just one side? It
22732259 * seems that if we assume equal distribution for the other side, we
22742260 * end up with the same answer anyway.
2275- *
2276- * An additional hack we use here is to clamp the nd1 and nd2 values
2277- * to not more than what we are estimating the input relation sizes to
2278- * be, providing a crude correction for the selectivity of restriction
2279- * clauses on those relations.(We don't do that in the other path
2280- * since there we are comparing the nd values to stats for the whole
2281- * relations.) We can apply this clamp both with respect to the base
2282- * relations from which the join variables come, and to the immediate
2283- * input relations of the current join.
22842261 */
22852262double nullfrac1 = stats1 ?stats1 -> stanullfrac :0.0 ;
22862263double nullfrac2 = stats2 ?stats2 -> stanullfrac :0.0 ;
22872264
2288- if (vardata1 -> rel )
2289- nd1 = Min (nd1 ,vardata1 -> rel -> rows );
2290- nd1 = Min (nd1 ,rel1 -> rows );
2291- if (vardata2 -> rel )
2292- nd2 = Min (nd2 ,vardata2 -> rel -> rows );
2293- nd2 = Min (nd2 ,rel2 -> rows );
2294-
22952265selec = (1.0 - nullfrac1 )* (1.0 - nullfrac2 );
22962266if (nd1 > nd2 )
22972267selec /=nd1 ;
@@ -2318,7 +2288,7 @@ eqjoinsel_inner(Oid operator,
23182288static double
23192289eqjoinsel_semi (Oid operator ,
23202290VariableStatData * vardata1 ,VariableStatData * vardata2 ,
2321- RelOptInfo * rel1 , RelOptInfo * rel2 )
2291+ RelOptInfo * inner_rel )
23222292{
23232293double selec ;
23242294double nd1 ;
@@ -2338,6 +2308,25 @@ eqjoinsel_semi(Oid operator,
23382308nd1 = get_variable_numdistinct (vardata1 );
23392309nd2 = get_variable_numdistinct (vardata2 );
23402310
2311+ /*
2312+ * We clamp nd2 to be not more than what we estimate the inner relation's
2313+ * size to be. This is intuitively somewhat reasonable since obviously
2314+ * there can't be more than that many distinct values coming from the
2315+ * inner rel. The reason for the asymmetry (ie, that we don't clamp nd1
2316+ * likewise) is that this is the only pathway by which restriction clauses
2317+ * applied to the inner rel will affect the join result size estimate,
2318+ * since set_joinrel_size_estimates will multiply SEMI/ANTI selectivity by
2319+ * only the outer rel's size. If we clamped nd1 we'd be double-counting
2320+ * the selectivity of outer-rel restrictions.
2321+ *
2322+ * We can apply this clamping both with respect to the base relation from
2323+ * which the join variable comes (if there is just one), and to the
2324+ * immediate inner input relation of the current join.
2325+ */
2326+ if (vardata2 -> rel )
2327+ nd2 = Min (nd2 ,vardata2 -> rel -> rows );
2328+ nd2 = Min (nd2 ,inner_rel -> rows );
2329+
23412330if (HeapTupleIsValid (vardata1 -> statsTuple ))
23422331{
23432332stats1 = (Form_pg_statistic )GETSTRUCT (vardata1 -> statsTuple );
@@ -2381,11 +2370,21 @@ eqjoinsel_semi(Oid operator,
23812370uncertainfrac ,
23822371uncertain ;
23832372int i ,
2384- nmatches ;
2373+ nmatches ,
2374+ clamped_nvalues2 ;
2375+
2376+ /*
2377+ * The clamping above could have resulted in nd2 being less than
2378+ * nvalues2; in which case, we assume that precisely the nd2 most
2379+ * common values in the relation will appear in the join input, and so
2380+ * compare to only the first nd2 members of the MCV list. Of course
2381+ * this is frequently wrong, but it's the best bet we can make.
2382+ */
2383+ clamped_nvalues2 = Min (nvalues2 ,nd2 );
23852384
23862385fmgr_info (get_opcode (operator ),& eqproc );
23872386hasmatch1 = (bool * )palloc0 (nvalues1 * sizeof (bool ));
2388- hasmatch2 = (bool * )palloc0 (nvalues2 * sizeof (bool ));
2387+ hasmatch2 = (bool * )palloc0 (clamped_nvalues2 * sizeof (bool ));
23892388
23902389/*
23912390 * Note we assume that each MCV will match at most one member of the
@@ -2398,7 +2397,7 @@ eqjoinsel_semi(Oid operator,
23982397{
23992398int j ;
24002399
2401- for (j = 0 ;j < nvalues2 ;j ++ )
2400+ for (j = 0 ;j < clamped_nvalues2 ;j ++ )
24022401{
24032402if (hasmatch2 [j ])
24042403continue ;
@@ -2443,7 +2442,7 @@ eqjoinsel_semi(Oid operator,
24432442{
24442443nd1 -= nmatches ;
24452444nd2 -= nmatches ;
2446- if (nd1 <=nd2 || nd2 <= 0 )
2445+ if (nd1 <=nd2 || nd2 < 0 )
24472446uncertainfrac = 1.0 ;
24482447else
24492448uncertainfrac = nd2 /nd1 ;
@@ -2464,14 +2463,7 @@ eqjoinsel_semi(Oid operator,
24642463
24652464if (nd1 != DEFAULT_NUM_DISTINCT && nd2 != DEFAULT_NUM_DISTINCT )
24662465{
2467- if (vardata1 -> rel )
2468- nd1 = Min (nd1 ,vardata1 -> rel -> rows );
2469- nd1 = Min (nd1 ,rel1 -> rows );
2470- if (vardata2 -> rel )
2471- nd2 = Min (nd2 ,vardata2 -> rel -> rows );
2472- nd2 = Min (nd2 ,rel2 -> rows );
2473-
2474- if (nd1 <=nd2 || nd2 <=0 )
2466+ if (nd1 <=nd2 || nd2 < 0 )
24752467selec = 1.0 - nullfrac1 ;
24762468else
24772469selec = (nd2 /nd1 )* (1.0 - nullfrac1 );