5454 * Portions Copyright (c) 1994, Regents of the University of California
5555 *
5656 * IDENTIFICATION
57- * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.157 2006/06/05 20:56:33 tgl Exp $
57+ * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.158 2006/06/06 17:59:57 tgl Exp $
5858 *
5959 *-------------------------------------------------------------------------
6060 */
@@ -133,7 +133,7 @@ clamp_row_est(double nrows)
133133 * better and to avoid possible divide-by-zero when interpolating costs.
134134 * Make it an integer, too.
135135 */
136- if (nrows < 1.0 )
136+ if (nrows <= 1.0 )
137137nrows = 1.0 ;
138138else
139139nrows = rint (nrows );
@@ -181,8 +181,10 @@ cost_seqscan(Path *path, PlannerInfo *root,
181181 *
182182 * 'index' is the index to be used
183183 * 'indexQuals' is the list of applicable qual clauses (implicit AND semantics)
184- * 'is_injoin' is T if we are considering using the index scan as the inside
185- *of a nestloop join (hence, some of the indexQuals are join clauses)
184+ * 'outer_rel' is the outer relation when we are considering using the index
185+ *scan as the inside of a nestloop join (hence, some of the indexQuals
186+ *are join clauses, and we should expect repeated scans of the index);
187+ *NULL for a plain index scan
186188 *
187189 * cost_index() takes an IndexPath not just a Path, because it sets a few
188190 * additional fields of the IndexPath besides startup_cost and total_cost.
200202cost_index (IndexPath * path ,PlannerInfo * root ,
201203IndexOptInfo * index ,
202204List * indexQuals ,
203- bool is_injoin )
205+ RelOptInfo * outer_rel )
204206{
205207RelOptInfo * baserel = index -> rel ;
206208Cost startup_cost = 0 ;
@@ -215,8 +217,6 @@ cost_index(IndexPath *path, PlannerInfo *root,
215217Cost cpu_per_tuple ;
216218double tuples_fetched ;
217219double pages_fetched ;
218- double T ,
219- b ;
220220
221221/* Should only be applied to base relations */
222222Assert (IsA (baserel ,RelOptInfo )&&
@@ -233,10 +233,11 @@ cost_index(IndexPath *path, PlannerInfo *root,
233233 * the fraction of main-table tuples we will have to retrieve) and its
234234 * correlation to the main-table tuple order.
235235 */
236- OidFunctionCall7 (index -> amcostestimate ,
236+ OidFunctionCall8 (index -> amcostestimate ,
237237PointerGetDatum (root ),
238238PointerGetDatum (index ),
239239PointerGetDatum (indexQuals ),
240+ PointerGetDatum (outer_rel ),
240241PointerGetDatum (& indexStartupCost ),
241242PointerGetDatum (& indexTotalCost ),
242243PointerGetDatum (& indexSelectivity ),
@@ -254,86 +255,75 @@ cost_index(IndexPath *path, PlannerInfo *root,
254255startup_cost += indexStartupCost ;
255256run_cost += indexTotalCost - indexStartupCost ;
256257
258+ /* estimate number of main-table tuples fetched */
259+ tuples_fetched = clamp_row_est (indexSelectivity * baserel -> tuples );
260+
257261/*----------
258- * Estimate number of main-tabletuples andpages fetched .
262+ * Estimate number of main-tablepages fetched, andcompute I/O cost .
259263 *
260264 * When the index ordering is uncorrelated with the table ordering,
261- * we use an approximation proposed by Mackert and Lohman, "Index Scans
262- * Using a Finite LRU Buffer: A Validated I/O Model", ACM Transactions
263- * on Database Systems, Vol. 14, No. 3, September 1989, Pages 401-424.
264- * The Mackert and Lohman approximation is that the number of pages
265- * fetched is
266- *PF =
267- *min(2TNs/(2T+Ns), T)when T <= b
268- *2TNs/(2T+Ns)when T > b and Ns <= 2Tb/(2T-b)
269- *b + (Ns - 2Tb/(2T-b))*(T-b)/Twhen T > b and Ns > 2Tb/(2T-b)
270- * where
271- *T = # pages in table
272- *N = # tuples in table
273- *s = selectivity = fraction of table to be scanned
274- *b = # buffer pages available (we include kernel space here)
265+ * we use an approximation proposed by Mackert and Lohman (see
266+ * index_pages_fetched() for details) to compute the number of pages
267+ * fetched, and then charge random_page_cost per page fetched.
275268 *
276269 * When the index ordering is exactly correlated with the table ordering
277270 * (just after a CLUSTER, for example), the number of pages fetched should
278- * bejust sT. What's more,these will be sequential fetches, not the
279- * random fetches that occur in the uncorrelated case.So, depending on
280- *the extent of correlation, we should estimate the actual I/O cost
281- *somewhere between s * T * seq_page_cost and PF * random_page_cost.
282- * We currently interpolate linearly between these two endpoints based on
283- *the correlation squared (XXX is that appropriate?).
284- *
285- *In any case thenumber of tuples fetched isNs .
271+ * beexactly selectivity * table_size. What's more,all but the first
272+ *will be sequential fetches, not the random fetches that occur in the
273+ *uncorrelated case. So if the number of pages is more than 1, we
274+ *ought to charge
275+ *random_page_cost + (pages_fetched - 1) * seq_page_cost
276+ *For partially-correlated indexes, we ought to charge somewhere between
277+ * these two estimates. We currently interpolate linearly between the
278+ *estimates based on thecorrelation squared (XXX isthat appropriate?) .
286279 *----------
287280 */
281+ if (outer_rel != NULL && outer_rel -> rows > 1 )
282+ {
283+ /*
284+ * For repeated indexscans, scale up the number of tuples fetched
285+ * in the Mackert and Lohman formula by the number of scans, so
286+ * that we estimate the number of pages fetched by all the scans.
287+ * Then pro-rate the costs for one scan. In this case we assume
288+ * all the fetches are random accesses. XXX it'd be good to
289+ * include correlation in this model, but it's not clear how to do
290+ * that without double-counting cache effects.
291+ */
292+ double num_scans = outer_rel -> rows ;
288293
289- tuples_fetched = clamp_row_est ( indexSelectivity * baserel -> tuples );
290-
291- /* This part is the Mackert and Lohman formula */
294+ pages_fetched = index_pages_fetched ( tuples_fetched * num_scans ,
295+ baserel -> pages ,
296+ index -> pages );
292297
293- T = (baserel -> pages > 1 ) ? (double )baserel -> pages :1.0 ;
294- b = (effective_cache_size > 1 ) ?effective_cache_size :1.0 ;
295-
296- if (T <=b )
297- {
298- pages_fetched =
299- (2.0 * T * tuples_fetched ) / (2.0 * T + tuples_fetched );
300- if (pages_fetched >=T )
301- pages_fetched = T ;
302- else
303- pages_fetched = ceil (pages_fetched );
298+ run_cost += (pages_fetched * random_page_cost ) /num_scans ;
304299}
305300else
306301{
307- double lim ;
302+ /*
303+ * Normal case: apply the Mackert and Lohman formula, and then
304+ * interpolate between that and the correlation-derived result.
305+ */
306+ pages_fetched = index_pages_fetched (tuples_fetched ,
307+ baserel -> pages ,
308+ index -> pages );
308309
309- lim = (2.0 * T * b ) / (2.0 * T - b );
310- if (tuples_fetched <=lim )
311- {
312- pages_fetched =
313- (2.0 * T * tuples_fetched ) / (2.0 * T + tuples_fetched );
314- }
315- else
316- {
317- pages_fetched =
318- b + (tuples_fetched - lim )* (T - b ) /T ;
319- }
320- pages_fetched = ceil (pages_fetched );
321- }
310+ /* max_IO_cost is for the perfectly uncorrelated case (csquared=0) */
311+ max_IO_cost = pages_fetched * random_page_cost ;
322312
323- /*
324- * min_IO_cost corresponds to the perfectly correlated case (csquared=1),
325- * max_IO_cost to the perfectly uncorrelated case (csquared=0).
326- */
327- min_IO_cost = ceil (indexSelectivity * T )* seq_page_cost ;
328- max_IO_cost = pages_fetched * random_page_cost ;
313+ /* min_IO_cost is for the perfectly correlated case (csquared=1) */
314+ pages_fetched = ceil (indexSelectivity * (double )baserel -> pages );
315+ min_IO_cost = random_page_cost ;
316+ if (pages_fetched > 1 )
317+ min_IO_cost += (pages_fetched - 1 )* seq_page_cost ;
329318
330- /*
331- * Now interpolate based on estimated index order correlation to get total
332- * disk I/O cost for main table accesses.
333- */
334- csquared = indexCorrelation * indexCorrelation ;
319+ /*
320+ * Now interpolate based on estimated index order correlation to get
321+ * total disk I/O cost for main table accesses.
322+ */
323+ csquared = indexCorrelation * indexCorrelation ;
335324
336- run_cost += max_IO_cost + csquared * (min_IO_cost - max_IO_cost );
325+ run_cost += max_IO_cost + csquared * (min_IO_cost - max_IO_cost );
326+ }
337327
338328/*
339329 * Estimate CPU costs per tuple.
@@ -348,7 +338,7 @@ cost_index(IndexPath *path, PlannerInfo *root,
348338startup_cost += baserel -> baserestrictcost .startup ;
349339cpu_per_tuple = cpu_tuple_cost + baserel -> baserestrictcost .per_tuple ;
350340
351- if (! is_injoin )
341+ if (outer_rel == NULL )
352342{
353343QualCost index_qual_cost ;
354344
@@ -363,19 +353,102 @@ cost_index(IndexPath *path, PlannerInfo *root,
363353path -> path .total_cost = startup_cost + run_cost ;
364354}
365355
356+ /*
357+ * index_pages_fetched
358+ * Estimate the number of pages actually fetched after accounting for
359+ * cache effects.
360+ *
361+ * We use an approximation proposed by Mackert and Lohman, "Index Scans
362+ * Using a Finite LRU Buffer: A Validated I/O Model", ACM Transactions
363+ * on Database Systems, Vol. 14, No. 3, September 1989, Pages 401-424.
364+ * The Mackert and Lohman approximation is that the number of pages
365+ * fetched is
366+ *PF =
367+ *min(2TNs/(2T+Ns), T)when T <= b
368+ *2TNs/(2T+Ns)when T > b and Ns <= 2Tb/(2T-b)
369+ *b + (Ns - 2Tb/(2T-b))*(T-b)/Twhen T > b and Ns > 2Tb/(2T-b)
370+ * where
371+ *T = # pages in table
372+ *N = # tuples in table
373+ *s = selectivity = fraction of table to be scanned
374+ *b = # buffer pages available (we include kernel space here)
375+ *
376+ * We assume that effective_cache_size is the total number of buffer pages
377+ * available for both table and index, and pro-rate that space between the
378+ * table and index. (Ideally other_pages should include all the other
379+ * tables and indexes used by the query too; but we don't have a good way
380+ * to get that number here.)
381+ *
382+ * The product Ns is the number of tuples fetched; we pass in that
383+ * product rather than calculating it here.
384+ *
385+ * Caller is expected to have ensured that tuples_fetched is greater than zero
386+ * and rounded to integer (see clamp_row_est). The result will likewise be
387+ * greater than zero and integral.
388+ */
389+ double
390+ index_pages_fetched (double tuples_fetched ,BlockNumber pages ,
391+ BlockNumber other_pages )
392+ {
393+ double pages_fetched ;
394+ double T ,
395+ b ;
396+
397+ /* T is # pages in table, but don't allow it to be zero */
398+ T = (pages > 1 ) ? (double )pages :1.0 ;
399+
400+ /* b is pro-rated share of effective_cache_size */
401+ b = effective_cache_size * T / (T + (double )other_pages );
402+ /* force it positive and integral */
403+ if (b <=1.0 )
404+ b = 1.0 ;
405+ else
406+ b = ceil (b );
407+
408+ /* This part is the Mackert and Lohman formula */
409+ if (T <=b )
410+ {
411+ pages_fetched =
412+ (2.0 * T * tuples_fetched ) / (2.0 * T + tuples_fetched );
413+ if (pages_fetched >=T )
414+ pages_fetched = T ;
415+ else
416+ pages_fetched = ceil (pages_fetched );
417+ }
418+ else
419+ {
420+ double lim ;
421+
422+ lim = (2.0 * T * b ) / (2.0 * T - b );
423+ if (tuples_fetched <=lim )
424+ {
425+ pages_fetched =
426+ (2.0 * T * tuples_fetched ) / (2.0 * T + tuples_fetched );
427+ }
428+ else
429+ {
430+ pages_fetched =
431+ b + (tuples_fetched - lim )* (T - b ) /T ;
432+ }
433+ pages_fetched = ceil (pages_fetched );
434+ }
435+ return pages_fetched ;
436+ }
437+
366438/*
367439 * cost_bitmap_heap_scan
368440 * Determines and returns the cost of scanning a relation using a bitmap
369441 * index-then-heap plan.
370442 *
371443 * 'baserel' is the relation to be scanned
372444 * 'bitmapqual' is a tree of IndexPaths, BitmapAndPaths, and BitmapOrPaths
373- * 'is_injoin' is T if we are considering using the scan as the inside
374- *of a nestloop join (hence, some of the quals are join clauses)
445+ *
446+ * Note: we take no explicit notice here of whether this is a join inner path.
447+ * If it is, the component IndexPaths should have been costed accordingly.
375448 */
376449void
377450cost_bitmap_heap_scan (Path * path ,PlannerInfo * root ,RelOptInfo * baserel ,
378- Path * bitmapqual , bool is_injoin )
451+ Path * bitmapqual )
379452{
380453Cost startup_cost = 0 ;
381454Cost run_cost = 0 ;