Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitb67fc00

Browse files
committed
Be a little smarter about deciding how many most-common values to save.
1 parentbf9e01d commitb67fc00

File tree

1 file changed

+104
-13
lines changed

1 file changed

+104
-13
lines changed

‎src/backend/commands/analyze.c

Lines changed: 104 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
/*-------------------------------------------------------------------------
22
*
33
* analyze.c
4-
* the postgresoptimizer analyzer
4+
* the postgresstatistics generator
55
*
66
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
77
* Portions Copyright (c) 1994, Regents of the University of California
88
*
99
*
1010
* IDENTIFICATION
11-
* $Header: /cvsroot/pgsql/src/backend/commands/analyze.c,v 1.18 2001/06/02 19:01:53 tgl Exp $
11+
* $Header: /cvsroot/pgsql/src/backend/commands/analyze.c,v 1.19 2001/06/06 21:29:17 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -63,7 +63,7 @@ typedef struct
6363
/* These fields are set up by examine_attribute */
6464
intattnum;/* attribute number */
6565
AlgCodealgcode;/* Which algorithm to use for this column */
66-
intminrows;/* Minimum # of rowsneeded for stats */
66+
intminrows;/* Minimum # of rowswanted for stats */
6767
Form_pg_attributeattr;/* copy of pg_attribute row for column */
6868
Form_pg_typeattrtype;/* copy of pg_type row for column */
6969
Oideqopr;/* '=' operator for datatype, if any */
@@ -990,7 +990,9 @@ compute_minimal_stats(VacAttrStats *stats,
990990
* exactly k times in our sample of r rows (from a total of n).
991991
* We assume (not very reliably!) that all the multiply-occurring
992992
* values are reflected in the final track[] list, and the other
993-
* nonnull values all appeared but once.
993+
* nonnull values all appeared but once. (XXX this usually
994+
* results in a drastic overestimate of ndistinct. Can we do
995+
* any better?)
994996
*----------
995997
*/
996998
intf1=nonnull_cnt-summultiple;
@@ -1011,9 +1013,49 @@ compute_minimal_stats(VacAttrStats *stats,
10111013
if (stats->stadistinct>0.1*totalrows)
10121014
stats->stadistinct=- (stats->stadistinct /totalrows);
10131015

1014-
/* Generate an MCV slot entry, only if we found multiples */
1015-
if (nmultiple<num_mcv)
1016-
num_mcv=nmultiple;
1016+
/*
1017+
* Decide how many values are worth storing as most-common values.
1018+
* If we are able to generate a complete MCV list (all the values
1019+
* in the sample will fit, and we think these are all the ones in
1020+
* the table), then do so. Otherwise, store only those values
1021+
* that are significantly more common than the (estimated) average.
1022+
* We set the threshold rather arbitrarily at 25% more than average,
1023+
* with at least 2 instances in the sample.
1024+
*/
1025+
if (track_cnt<track_max&&toowide_cnt==0&&
1026+
stats->stadistinct>0&&
1027+
track_cnt <=num_mcv)
1028+
{
1029+
/* Track list includes all values seen, and all will fit */
1030+
num_mcv=track_cnt;
1031+
}
1032+
else
1033+
{
1034+
doublendistinct=stats->stadistinct;
1035+
doubleavgcount,
1036+
mincount;
1037+
1038+
if (ndistinct<0)
1039+
ndistinct=-ndistinct*totalrows;
1040+
/* estimate # of occurrences in sample of a typical value */
1041+
avgcount= (double)numrows /ndistinct;
1042+
/* set minimum threshold count to store a value */
1043+
mincount=avgcount*1.25;
1044+
if (mincount<2)
1045+
mincount=2;
1046+
if (num_mcv>track_cnt)
1047+
num_mcv=track_cnt;
1048+
for (i=0;i<num_mcv;i++)
1049+
{
1050+
if (track[i].count<mincount)
1051+
{
1052+
num_mcv=i;
1053+
break;
1054+
}
1055+
}
1056+
}
1057+
1058+
/* Generate MCV slot entry */
10171059
if (num_mcv>0)
10181060
{
10191061
MemoryContextold_context;
@@ -1080,6 +1122,7 @@ compute_scalar_stats(VacAttrStats *stats,
10801122
ScalarMCVItem*track;
10811123
inttrack_cnt=0;
10821124
intnum_mcv=stats->attr->attstattarget;
1125+
intnum_bins=stats->attr->attstattarget;
10831126

10841127
values= (ScalarItem*)palloc(numrows*sizeof(ScalarItem));
10851128
tupnoLink= (int*)palloc(numrows*sizeof(int));
@@ -1266,10 +1309,57 @@ compute_scalar_stats(VacAttrStats *stats,
12661309
if (stats->stadistinct>0.1*totalrows)
12671310
stats->stadistinct=- (stats->stadistinct /totalrows);
12681311

1269-
/* Generate an MCV slot entry, only if we found multiples */
1270-
if (nmultiple<num_mcv)
1271-
num_mcv=nmultiple;
1272-
Assert(track_cnt >=num_mcv);
1312+
/*
1313+
* Decide how many values are worth storing as most-common values.
1314+
* If we are able to generate a complete MCV list (all the values
1315+
* in the sample will fit, and we think these are all the ones in
1316+
* the table), then do so. Otherwise, store only those values
1317+
* that are significantly more common than the (estimated) average.
1318+
* We set the threshold rather arbitrarily at 25% more than average,
1319+
* with at least 2 instances in the sample. Also, we won't suppress
1320+
* values that have a frequency of at least 1/K where K is the
1321+
* intended number of histogram bins; such values might otherwise
1322+
* cause us to emit duplicate histogram bin boundaries.
1323+
*/
1324+
if (track_cnt==ndistinct&&toowide_cnt==0&&
1325+
stats->stadistinct>0&&
1326+
track_cnt <=num_mcv)
1327+
{
1328+
/* Track list includes all values seen, and all will fit */
1329+
num_mcv=track_cnt;
1330+
}
1331+
else
1332+
{
1333+
doublendistinct=stats->stadistinct;
1334+
doubleavgcount,
1335+
mincount,
1336+
maxmincount;
1337+
1338+
if (ndistinct<0)
1339+
ndistinct=-ndistinct*totalrows;
1340+
/* estimate # of occurrences in sample of a typical value */
1341+
avgcount= (double)numrows /ndistinct;
1342+
/* set minimum threshold count to store a value */
1343+
mincount=avgcount*1.25;
1344+
if (mincount<2)
1345+
mincount=2;
1346+
/* don't let threshold exceed 1/K, however */
1347+
maxmincount= (double)numrows / (double)num_bins;
1348+
if (mincount>maxmincount)
1349+
mincount=maxmincount;
1350+
if (num_mcv>track_cnt)
1351+
num_mcv=track_cnt;
1352+
for (i=0;i<num_mcv;i++)
1353+
{
1354+
if (track[i].count<mincount)
1355+
{
1356+
num_mcv=i;
1357+
break;
1358+
}
1359+
}
1360+
}
1361+
1362+
/* Generate MCV slot entry */
12731363
if (num_mcv>0)
12741364
{
12751365
MemoryContextold_context;
@@ -1304,8 +1394,8 @@ compute_scalar_stats(VacAttrStats *stats,
13041394
* ensures the histogram won't collapse to empty or a singleton.)
13051395
*/
13061396
num_hist=ndistinct-num_mcv;
1307-
if (num_hist>stats->attr->attstattarget)
1308-
num_hist=stats->attr->attstattarget+1;
1397+
if (num_hist>num_bins)
1398+
num_hist=num_bins+1;
13091399
if (num_hist >=2)
13101400
{
13111401
MemoryContextold_context;
@@ -1321,6 +1411,7 @@ compute_scalar_stats(VacAttrStats *stats,
13211411
*
13221412
* Note we destroy the values[] array here... but we don't need
13231413
* it for anything more. We do, however, still need values_cnt.
1414+
* nvals will be the number of remaining entries in values[].
13241415
*/
13251416
if (num_mcv>0)
13261417
{

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp