4949 * Portions Copyright (c) 1994, Regents of the University of California
5050 *
5151 * IDENTIFICATION
52- * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.124 2004/02/03 17:34:03 tgl Exp $
52+ * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.125 2004/02/17 00:52:53 tgl Exp $
5353 *
5454 *-------------------------------------------------------------------------
5555 */
@@ -102,8 +102,6 @@ boolenable_mergejoin = true;
102102bool enable_hashjoin = true;
103103
104104
105- static Selectivity estimate_hash_bucketsize (Query * root ,Var * var ,
106- int nbuckets );
107105static bool cost_qual_eval_walker (Node * node ,QualCost * total );
108106static Selectivity approx_selectivity (Query * root ,List * quals ,
109107JoinType jointype );
@@ -1152,7 +1150,7 @@ cost_hashjoin(HashPath *path, Query *root)
11521150/* not cached yet */
11531151thisbucketsize =
11541152estimate_hash_bucketsize (root ,
1155- ( Var * ) get_rightop (restrictinfo -> clause ),
1153+ get_rightop (restrictinfo -> clause ),
11561154virtualbuckets );
11571155restrictinfo -> right_bucketsize = thisbucketsize ;
11581156}
@@ -1168,7 +1166,7 @@ cost_hashjoin(HashPath *path, Query *root)
11681166/* not cached yet */
11691167thisbucketsize =
11701168estimate_hash_bucketsize (root ,
1171- ( Var * ) get_leftop (restrictinfo -> clause ),
1169+ get_leftop (restrictinfo -> clause ),
11721170virtualbuckets );
11731171restrictinfo -> left_bucketsize = thisbucketsize ;
11741172}
@@ -1249,179 +1247,6 @@ cost_hashjoin(HashPath *path, Query *root)
12491247path -> jpath .path .total_cost = startup_cost + run_cost ;
12501248}
12511249
1252- /*
1253- * Estimate hash bucketsize fraction (ie, number of entries in a bucket
1254- * divided by total tuples in relation) if the specified Var is used
1255- * as a hash key.
1256- *
1257- * XXX This is really pretty bogus since we're effectively assuming that the
1258- * distribution of hash keys will be the same after applying restriction
1259- * clauses as it was in the underlying relation. However, we are not nearly
1260- * smart enough to figure out how the restrict clauses might change the
1261- * distribution, so this will have to do for now.
1262- *
1263- * We are passed the number of buckets the executor will use for the given
1264- * input relation.If the data were perfectly distributed, with the same
1265- * number of tuples going into each available bucket, then the bucketsize
1266- * fraction would be 1/nbuckets. But this happy state of affairs will occur
1267- * only if (a) there are at least nbuckets distinct data values, and (b)
1268- * we have a not-too-skewed data distribution.Otherwise the buckets will
1269- * be nonuniformly occupied. If the other relation in the join has a key
1270- * distribution similar to this one's, then the most-loaded buckets are
1271- * exactly those that will be probed most often. Therefore, the "average"
1272- * bucket size for costing purposes should really be taken as something close
1273- * to the "worst case" bucket size. We try to estimate this by adjusting the
1274- * fraction if there are too few distinct data values, and then scaling up
1275- * by the ratio of the most common value's frequency to the average frequency.
1276- *
1277- * If no statistics are available, use a default estimate of 0.1. This will
1278- * discourage use of a hash rather strongly if the inner relation is large,
1279- * which is what we want. We do not want to hash unless we know that the
1280- * inner rel is well-dispersed (or the alternatives seem much worse).
1281- */
1282- static Selectivity
1283- estimate_hash_bucketsize (Query * root ,Var * var ,int nbuckets )
1284- {
1285- Oid relid ;
1286- RelOptInfo * rel ;
1287- HeapTuple tuple ;
1288- Form_pg_statistic stats ;
1289- double estfract ,
1290- ndistinct ,
1291- mcvfreq ,
1292- avgfreq ;
1293- float4 * numbers ;
1294- int nnumbers ;
1295-
1296- /* Ignore any binary-compatible relabeling */
1297- if (var && IsA (var ,RelabelType ))
1298- var = (Var * ) ((RelabelType * )var )-> arg ;
1299-
1300- /*
1301- * Lookup info about var's relation and attribute; if none available,
1302- * return default estimate.
1303- */
1304- if (var == NULL || !IsA (var ,Var ))
1305- return 0.1 ;
1306-
1307- relid = getrelid (var -> varno ,root -> rtable );
1308- if (relid == InvalidOid )
1309- return 0.1 ;
1310-
1311- rel = find_base_rel (root ,var -> varno );
1312-
1313- if (rel -> tuples <=0.0 || rel -> rows <=0.0 )
1314- return 0.1 ;/* ensure we can divide below */
1315-
1316- tuple = SearchSysCache (STATRELATT ,
1317- ObjectIdGetDatum (relid ),
1318- Int16GetDatum (var -> varattno ),
1319- 0 ,0 );
1320- if (!HeapTupleIsValid (tuple ))
1321- {
1322- /*
1323- * If the attribute is known unique because of an index,
1324- * we can treat it as well-distributed.
1325- */
1326- if (has_unique_index (rel ,var -> varattno ))
1327- return 1.0 / (double )nbuckets ;
1328-
1329- /*
1330- * Perhaps the Var is a system attribute; if so, it will have no
1331- * entry in pg_statistic, but we may be able to guess something
1332- * about its distribution anyway.
1333- */
1334- switch (var -> varattno )
1335- {
1336- case ObjectIdAttributeNumber :
1337- case SelfItemPointerAttributeNumber :
1338- /* these are unique, so buckets should be well-distributed */
1339- return 1.0 / (double )nbuckets ;
1340- case TableOidAttributeNumber :
1341- /* hashing this is a terrible idea... */
1342- return 1.0 ;
1343- }
1344- return 0.1 ;
1345- }
1346- stats = (Form_pg_statistic )GETSTRUCT (tuple );
1347-
1348- /*
1349- * Obtain number of distinct data values in raw relation.
1350- */
1351- ndistinct = stats -> stadistinct ;
1352- if (ndistinct < 0.0 )
1353- ndistinct = - ndistinct * rel -> tuples ;
1354-
1355- if (ndistinct <=0.0 )/* ensure we can divide */
1356- {
1357- ReleaseSysCache (tuple );
1358- return 0.1 ;
1359- }
1360-
1361- /* Also compute avg freq of all distinct data values in raw relation */
1362- avgfreq = (1.0 - stats -> stanullfrac ) /ndistinct ;
1363-
1364- /*
1365- * Adjust ndistinct to account for restriction clauses. Observe we
1366- * are assuming that the data distribution is affected uniformly by
1367- * the restriction clauses!
1368- *
1369- * XXX Possibly better way, but much more expensive: multiply by
1370- * selectivity of rel's restriction clauses that mention the target
1371- * Var.
1372- */
1373- ndistinct *=rel -> rows /rel -> tuples ;
1374-
1375- /*
1376- * Initial estimate of bucketsize fraction is 1/nbuckets as long as
1377- * the number of buckets is less than the expected number of distinct
1378- * values; otherwise it is 1/ndistinct.
1379- */
1380- if (ndistinct > (double )nbuckets )
1381- estfract = 1.0 / (double )nbuckets ;
1382- else
1383- estfract = 1.0 /ndistinct ;
1384-
1385- /*
1386- * Look up the frequency of the most common value, if available.
1387- */
1388- mcvfreq = 0.0 ;
1389-
1390- if (get_attstatsslot (tuple ,var -> vartype ,var -> vartypmod ,
1391- STATISTIC_KIND_MCV ,InvalidOid ,
1392- NULL ,NULL ,& numbers ,& nnumbers ))
1393- {
1394- /*
1395- * The first MCV stat is for the most common value.
1396- */
1397- if (nnumbers > 0 )
1398- mcvfreq = numbers [0 ];
1399- free_attstatsslot (var -> vartype ,NULL ,0 ,
1400- numbers ,nnumbers );
1401- }
1402-
1403- /*
1404- * Adjust estimated bucketsize upward to account for skewed
1405- * distribution.
1406- */
1407- if (avgfreq > 0.0 && mcvfreq > avgfreq )
1408- estfract *=mcvfreq /avgfreq ;
1409-
1410- /*
1411- * Clamp bucketsize to sane range (the above adjustment could easily
1412- * produce an out-of-range result). We set the lower bound a little
1413- * above zero, since zero isn't a very sane result.
1414- */
1415- if (estfract < 1.0e-6 )
1416- estfract = 1.0e-6 ;
1417- else if (estfract > 1.0 )
1418- estfract = 1.0 ;
1419-
1420- ReleaseSysCache (tuple );
1421-
1422- return (Selectivity )estfract ;
1423- }
1424-
14251250
14261251/*
14271252 * cost_qual_eval