11/*-------------------------------------------------------------------------
22 *
33 * analyze.c
4- * the postgresoptimizer analyzer
4+ * the postgresstatistics generator
55 *
66 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
77 * Portions Copyright (c) 1994, Regents of the University of California
88 *
99 *
1010 * IDENTIFICATION
11- * $Header: /cvsroot/pgsql/src/backend/commands/analyze.c,v 1.18 2001/06/02 19:01:53 tgl Exp $
11+ * $Header: /cvsroot/pgsql/src/backend/commands/analyze.c,v 1.19 2001/06/06 21:29:17 tgl Exp $
1212 *
1313 *-------------------------------------------------------------------------
1414 */
@@ -63,7 +63,7 @@ typedef struct
6363/* These fields are set up by examine_attribute */
6464int attnum ;/* attribute number */
6565AlgCode algcode ;/* Which algorithm to use for this column */
66- int minrows ;/* Minimum # of rowsneeded for stats */
66+ int minrows ;/* Minimum # of rowswanted for stats */
6767Form_pg_attribute attr ;/* copy of pg_attribute row for column */
6868Form_pg_type attrtype ;/* copy of pg_type row for column */
6969Oid eqopr ;/* '=' operator for datatype, if any */
@@ -990,7 +990,9 @@ compute_minimal_stats(VacAttrStats *stats,
990990 * exactly k times in our sample of r rows (from a total of n).
991991 * We assume (not very reliably!) that all the multiply-occurring
992992 * values are reflected in the final track[] list, and the other
993- * nonnull values all appeared but once.
993+ * nonnull values all appeared but once. (XXX this usually
994+ * results in a drastic overestimate of ndistinct. Can we do
995+ * any better?)
994996 *----------
995997 */
996998int f1 = nonnull_cnt - summultiple ;
@@ -1011,9 +1013,49 @@ compute_minimal_stats(VacAttrStats *stats,
10111013if (stats -> stadistinct > 0.1 * totalrows )
10121014stats -> stadistinct = - (stats -> stadistinct /totalrows );
10131015
1014- /* Generate an MCV slot entry, only if we found multiples */
1015- if (nmultiple < num_mcv )
1016- num_mcv = nmultiple ;
1016+ /*
1017+ * Decide how many values are worth storing as most-common values.
1018+ * If we are able to generate a complete MCV list (all the values
1019+ * in the sample will fit, and we think these are all the ones in
1020+ * the table), then do so. Otherwise, store only those values
1021+ * that are significantly more common than the (estimated) average.
1022+ * We set the threshold rather arbitrarily at 25% more than average,
1023+ * with at least 2 instances in the sample.
1024+ */
1025+ if (track_cnt < track_max && toowide_cnt == 0 &&
1026+ stats -> stadistinct > 0 &&
1027+ track_cnt <=num_mcv )
1028+ {
1029+ /* Track list includes all values seen, and all will fit */
1030+ num_mcv = track_cnt ;
1031+ }
1032+ else
1033+ {
1034+ double ndistinct = stats -> stadistinct ;
1035+ double avgcount ,
1036+ mincount ;
1037+
1038+ if (ndistinct < 0 )
1039+ ndistinct = - ndistinct * totalrows ;
1040+ /* estimate # of occurrences in sample of a typical value */
1041+ avgcount = (double )numrows /ndistinct ;
1042+ /* set minimum threshold count to store a value */
1043+ mincount = avgcount * 1.25 ;
1044+ if (mincount < 2 )
1045+ mincount = 2 ;
1046+ if (num_mcv > track_cnt )
1047+ num_mcv = track_cnt ;
1048+ for (i = 0 ;i < num_mcv ;i ++ )
1049+ {
1050+ if (track [i ].count < mincount )
1051+ {
1052+ num_mcv = i ;
1053+ break ;
1054+ }
1055+ }
1056+ }
1057+
1058+ /* Generate MCV slot entry */
10171059if (num_mcv > 0 )
10181060{
10191061MemoryContext old_context ;
@@ -1080,6 +1122,7 @@ compute_scalar_stats(VacAttrStats *stats,
10801122ScalarMCVItem * track ;
10811123int track_cnt = 0 ;
10821124int num_mcv = stats -> attr -> attstattarget ;
1125+ int num_bins = stats -> attr -> attstattarget ;
10831126
10841127values = (ScalarItem * )palloc (numrows * sizeof (ScalarItem ));
10851128tupnoLink = (int * )palloc (numrows * sizeof (int ));
@@ -1266,10 +1309,57 @@ compute_scalar_stats(VacAttrStats *stats,
12661309if (stats -> stadistinct > 0.1 * totalrows )
12671310stats -> stadistinct = - (stats -> stadistinct /totalrows );
12681311
1269- /* Generate an MCV slot entry, only if we found multiples */
1270- if (nmultiple < num_mcv )
1271- num_mcv = nmultiple ;
1272- Assert (track_cnt >=num_mcv );
1312+ /*
1313+ * Decide how many values are worth storing as most-common values.
1314+ * If we are able to generate a complete MCV list (all the values
1315+ * in the sample will fit, and we think these are all the ones in
1316+ * the table), then do so. Otherwise, store only those values
1317+ * that are significantly more common than the (estimated) average.
1318+ * We set the threshold rather arbitrarily at 25% more than average,
1319+ * with at least 2 instances in the sample. Also, we won't suppress
1320+ * values that have a frequency of at least 1/K where K is the
1321+ * intended number of histogram bins; such values might otherwise
1322+ * cause us to emit duplicate histogram bin boundaries.
1323+ */
1324+ if (track_cnt == ndistinct && toowide_cnt == 0 &&
1325+ stats -> stadistinct > 0 &&
1326+ track_cnt <=num_mcv )
1327+ {
1328+ /* Track list includes all values seen, and all will fit */
1329+ num_mcv = track_cnt ;
1330+ }
1331+ else
1332+ {
1333+ double ndistinct = stats -> stadistinct ;
1334+ double avgcount ,
1335+ mincount ,
1336+ maxmincount ;
1337+
1338+ if (ndistinct < 0 )
1339+ ndistinct = - ndistinct * totalrows ;
1340+ /* estimate # of occurrences in sample of a typical value */
1341+ avgcount = (double )numrows /ndistinct ;
1342+ /* set minimum threshold count to store a value */
1343+ mincount = avgcount * 1.25 ;
1344+ if (mincount < 2 )
1345+ mincount = 2 ;
1346+ /* don't let threshold exceed 1/K, however */
1347+ maxmincount = (double )numrows / (double )num_bins ;
1348+ if (mincount > maxmincount )
1349+ mincount = maxmincount ;
1350+ if (num_mcv > track_cnt )
1351+ num_mcv = track_cnt ;
1352+ for (i = 0 ;i < num_mcv ;i ++ )
1353+ {
1354+ if (track [i ].count < mincount )
1355+ {
1356+ num_mcv = i ;
1357+ break ;
1358+ }
1359+ }
1360+ }
1361+
1362+ /* Generate MCV slot entry */
12731363if (num_mcv > 0 )
12741364{
12751365MemoryContext old_context ;
@@ -1304,8 +1394,8 @@ compute_scalar_stats(VacAttrStats *stats,
13041394 * ensures the histogram won't collapse to empty or a singleton.)
13051395 */
13061396num_hist = ndistinct - num_mcv ;
1307- if (num_hist > stats -> attr -> attstattarget )
1308- num_hist = stats -> attr -> attstattarget + 1 ;
1397+ if (num_hist > num_bins )
1398+ num_hist = num_bins + 1 ;
13091399if (num_hist >=2 )
13101400{
13111401MemoryContext old_context ;
@@ -1321,6 +1411,7 @@ compute_scalar_stats(VacAttrStats *stats,
13211411 *
13221412 * Note we destroy the values[] array here... but we don't need
13231413 * it for anything more. We do, however, still need values_cnt.
1414+ * nvals will be the number of remaining entries in values[].
13241415 */
13251416if (num_mcv > 0 )
13261417{