1515 *
1616 *
1717 * IDENTIFICATION
18- * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.244 2008/03/08 22:41:38 tgl Exp $
18+ * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.245 2008/03/09 00:32:09 tgl Exp $
1919 *
2020 *-------------------------------------------------------------------------
2121 */
@@ -567,17 +567,23 @@ mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
567567 * or not it has anything to do with the histogram sort operator. We are
568568 * essentially using the histogram just as a representative sample. However,
569569 * small histograms are unlikely to be all that representative, so the caller
570- * should specify a minimum histogram size to use, and fall back on some
571- * other approach if this routine fails.
570+ * should be prepared to fall back on some other estimation approach when the
571+ * histogram is missing or very small. It may also be prudent to combine this
572+ * approach with another one when the histogram is small.
572573 *
573- * The caller also specifies n_skip, which causes us to ignore the first and
574- * last n_skip histogram elements, on the grounds that they are outliers and
575- * hence not very representative. If in doubt, min_hist_size = 100 and
576- * n_skip = 1 are reasonable values.
574+ * If the actual histogram size is not at least min_hist_size, we won't bother
575+ * to do the calculation at all. Also, if the n_skip parameter is > 0, we
576+ * ignore the first and last n_skip histogram elements, on the grounds that
577+ * they are outliers and hence not very representative. Typical values for
578+ * these parameters are 10 and 1.
577579 *
578580 * The function result is the selectivity, or -1 if there is no histogram
579581 * or it's smaller than min_hist_size.
580582 *
583+ * The output parameter *hist_size receives the actual histogram size,
584+ * or zero if no histogram. Callers may use this number to decide how
585+ * much faith to put in the function result.
586+ *
581587 * Note that the result disregards both the most-common-values (if any) and
582588 * null entries. The caller is expected to combine this result with
583589 * statistics for those portions of the column population.It may also be
@@ -586,7 +592,8 @@ mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
586592double
587593histogram_selectivity (VariableStatData * vardata ,FmgrInfo * opproc ,
588594Datum constval ,bool varonleft ,
589- int min_hist_size ,int n_skip )
595+ int min_hist_size ,int n_skip ,
596+ int * hist_size )
590597{
591598double result ;
592599Datum * values ;
@@ -603,6 +610,7 @@ histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
603610& values ,& nvalues ,
604611NULL ,NULL ))
605612{
613+ * hist_size = nvalues ;
606614if (nvalues >=min_hist_size )
607615{
608616int nmatch = 0 ;
@@ -626,7 +634,10 @@ histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
626634free_attstatsslot (vardata -> atttype ,values ,nvalues ,NULL ,0 );
627635}
628636else
637+ {
638+ * hist_size = 0 ;
629639result = -1 ;
640+ }
630641
631642return result ;
632643}
@@ -1117,13 +1128,16 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
11171128 * selectivity of the fixed prefix and remainder of pattern
11181129 * separately, then combine the two to get an estimate of the
11191130 * selectivity for the part of the column population represented by
1120- * the histogram. We then add up data for any most-common-values
1121- * values; these are not in the histogram population, and we can get
1122- * exact answers for them by applying the pattern operator, so there's
1123- * no reason to approximate. (If the MCVs cover a significant part of
1124- * the total population, this gives us a big leg up in accuracy.)
1131+ * the histogram. (For small histograms, we combine these approaches.)
1132+ *
1133+ * We then add up data for any most-common-values values; these are
1134+ * not in the histogram population, and we can get exact answers for
1135+ * them by applying the pattern operator, so there's no reason to
1136+ * approximate. (If the MCVs cover a significant part of the total
1137+ * population, this gives us a big leg up in accuracy.)
11251138 */
11261139Selectivity selec ;
1140+ int hist_size ;
11271141FmgrInfo opproc ;
11281142double nullfrac ,
11291143mcv_selec ,
@@ -1133,10 +1147,12 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
11331147fmgr_info (get_opcode (operator ),& opproc );
11341148
11351149selec = histogram_selectivity (& vardata ,& opproc ,constval , true,
1136- 100 ,1 );
1137- if (selec < 0 )
1150+ 10 ,1 ,& hist_size );
1151+
1152+ /* If not at least 100 entries, use the heuristic method */
1153+ if (hist_size < 100 )
11381154{
1139- /* Nope, so fake it with the heuristic method */
1155+ Selectivity heursel ;
11401156Selectivity prefixsel ;
11411157Selectivity restsel ;
11421158
@@ -1146,17 +1162,29 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
11461162else
11471163prefixsel = 1.0 ;
11481164restsel = pattern_selectivity (rest ,ptype );
1149- selec = prefixsel * restsel ;
1150- }
1151- else
1152- {
1153- /* Yes, but don't believe extremely small or large estimates. */
1154- if (selec < 0.0001 )
1155- selec = 0.0001 ;
1156- else if (selec > 0.9999 )
1157- selec = 0.9999 ;
1165+ heursel = prefixsel * restsel ;
1166+
1167+ if (selec < 0 )/* fewer than 10 histogram entries? */
1168+ selec = heursel ;
1169+ else
1170+ {
1171+ /*
1172+ * For histogram sizes from 10 to 100, we combine the
1173+ * histogram and heuristic selectivities, putting increasingly
1174+ * more trust in the histogram for larger sizes.
1175+ */
1176+ double hist_weight = hist_size /100.0 ;
1177+
1178+ selec = selec * hist_weight + heursel * (1.0 - hist_weight );
1179+ }
11581180}
11591181
1182+ /* In any case, don't believe extremely small or large estimates. */
1183+ if (selec < 0.0001 )
1184+ selec = 0.0001 ;
1185+ else if (selec > 0.9999 )
1186+ selec = 0.9999 ;
1187+
11601188/*
11611189 * If we have most-common-values info, add up the fractions of the MCV
11621190 * entries that satisfy MCV OP PATTERN. These fractions contribute