@@ -1335,25 +1335,40 @@ cost_material(Path *path,
  * Determines and returns the cost of performing an Agg plan node,
  * including the cost of its input.
  *
+ * aggcosts can be NULL when there are no actual aggregate functions (i.e.,
+ * we are using a hashed Agg node just to do grouping).
+ *
  * Note: when aggstrategy == AGG_SORTED, caller must ensure that input costs
  * are for appropriately-sorted input.
  */
 void
 cost_agg(Path *path, PlannerInfo *root,
-         AggStrategy aggstrategy, int numAggs,
+         AggStrategy aggstrategy, const AggClauseCosts *aggcosts,
          int numGroupCols, double numGroups,
          Cost input_startup_cost, Cost input_total_cost,
          double input_tuples)
 {
     Cost        startup_cost;
     Cost        total_cost;
+    AggClauseCosts dummy_aggcosts;
+
+    /* Use all-zero per-aggregate costs if NULL is passed */
+    if (aggcosts == NULL)
+    {
+        Assert(aggstrategy == AGG_HASHED);
+        MemSet(&dummy_aggcosts, 0, sizeof(AggClauseCosts));
+        aggcosts = &dummy_aggcosts;
+    }
 
     /*
-     * We charge one cpu_operator_cost per aggregate function per input tuple,
-     * and another one per output tuple (corresponding to transfn and finalfn
-     * calls respectively).  If we are grouping, we charge an additional
-     * cpu_operator_cost per grouping column per input tuple for grouping
-     * comparisons.
+     * The transCost.per_tuple component of aggcosts should be charged once
+     * per input tuple, corresponding to the costs of evaluating the aggregate
+     * transfns and their input expressions (with any startup cost of course
+     * charged but once).  The finalCost component is charged once per output
+     * tuple, corresponding to the costs of evaluating the finalfns.
+     *
+     * If we are grouping, we charge an additional cpu_operator_cost per
+     * grouping column per input tuple for grouping comparisons.
      *
      * We will produce a single output tuple if not grouping, and a tuple per
      * group otherwise.  We charge cpu_tuple_cost for each output tuple.
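
For reference, here is a minimal sketch of the shape cost_agg now expects behind its aggcosts argument. Only the fields read above are shown; the real AggClauseCosts declaration lives in the planner headers and carries additional bookkeeping (aggregate counts, transition space, and so on), so treat the field comments as illustrative rather than authoritative.

typedef double Cost;            /* as in nodes/nodes.h */

typedef struct QualCost
{
    Cost        startup;        /* one-time cost */
    Cost        per_tuple;      /* cost per evaluation */
} QualCost;

typedef struct AggClauseCosts
{
    QualCost    transCost;      /* combined cost of the transfns and their
                                 * input expressions: startup charged once,
                                 * per_tuple charged per input tuple */
    Cost        finalCost;      /* combined cost of the finalfns, charged
                                 * once per output tuple */
    /* ... further bookkeeping fields omitted from this sketch ... */
} AggClauseCosts;
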
@@ -1366,15 +1381,13 @@ cost_agg(Path *path, PlannerInfo *root,
      * there's roundoff error we might do the wrong thing.  So be sure that
      * the computations below form the same intermediate values in the same
      * order.
-     *
-     * Note: ideally we should use the pg_proc.procost costs of each
-     * aggregate's component functions, but for now that seems like an
-     * excessive amount of work.
      */
     if (aggstrategy == AGG_PLAIN)
     {
         startup_cost = input_total_cost;
-        startup_cost += cpu_operator_cost * (input_tuples + 1) * numAggs;
+        startup_cost += aggcosts->transCost.startup;
+        startup_cost += aggcosts->transCost.per_tuple * input_tuples;
+        startup_cost += aggcosts->finalCost;
         /* we aren't grouping */
         total_cost = startup_cost + cpu_tuple_cost;
     }
@@ -1384,19 +1397,21 @@ cost_agg(Path *path, PlannerInfo *root,
         startup_cost = input_startup_cost;
         total_cost = input_total_cost;
         /* calcs phrased this way to match HASHED case, see note above */
-        total_cost += cpu_operator_cost * input_tuples * numGroupCols;
-        total_cost += cpu_operator_cost * input_tuples * numAggs;
-        total_cost += cpu_operator_cost * numGroups * numAggs;
+        total_cost += aggcosts->transCost.startup;
+        total_cost += aggcosts->transCost.per_tuple * input_tuples;
+        total_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
+        total_cost += aggcosts->finalCost * numGroups;
         total_cost += cpu_tuple_cost * numGroups;
     }
     else
     {
         /* must be AGG_HASHED */
         startup_cost = input_total_cost;
-        startup_cost += cpu_operator_cost * input_tuples * numGroupCols;
-        startup_cost += cpu_operator_cost * input_tuples * numAggs;
+        startup_cost += aggcosts->transCost.startup;
+        startup_cost += aggcosts->transCost.per_tuple * input_tuples;
+        startup_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
         total_cost = startup_cost;
-        total_cost += cpu_operator_cost * numGroups * numAggs;
+        total_cost += aggcosts->finalCost * numGroups;
         total_cost += cpu_tuple_cost * numGroups;
     }
 
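
As a quick sanity check on the arithmetic, the standalone calculation below walks the AGG_HASHED branch with invented inputs (the row count, group count, and per-aggregate costs are all hypothetical; cpu_operator_cost and cpu_tuple_cost are the stock defaults of 0.0025 and 0.01). The AGG_PLAIN and AGG_SORTED branches follow the same pattern, minus or plus the grouping terms.

#include <stdio.h>

int
main(void)
{
    double  cpu_operator_cost = 0.0025; /* stock default */
    double  cpu_tuple_cost = 0.01;      /* stock default */

    double  input_total_cost = 1000.0;  /* cost of the input path (invented) */
    double  input_tuples = 10000.0;     /* rows fed into the Agg node */
    double  numGroups = 100.0;          /* estimated number of groups */
    int     numGroupCols = 2;           /* GROUP BY columns */

    /* assumed costs for one simple aggregate: transfn plus one argument,
     * and a single finalfn call */
    double  trans_startup = 0.0;
    double  trans_per_tuple = 2 * cpu_operator_cost;
    double  final_cost = cpu_operator_cost;

    double  startup_cost,
            total_cost;

    /* mirrors the AGG_HASHED branch above */
    startup_cost = input_total_cost;
    startup_cost += trans_startup;
    startup_cost += trans_per_tuple * input_tuples;
    startup_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
    total_cost = startup_cost;
    total_cost += final_cost * numGroups;
    total_cost += cpu_tuple_cost * numGroups;

    printf("startup=%.2f total=%.2f\n", startup_cost, total_cost);
    /* prints startup=1100.00 total=1101.25 with these numbers */
    return 0;
}
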
@@ -1413,25 +1428,53 @@ cost_agg(Path *path, PlannerInfo *root,
  */
 void
 cost_windowagg(Path *path, PlannerInfo *root,
-               int numWindowFuncs, int numPartCols, int numOrderCols,
+               List *windowFuncs, int numPartCols, int numOrderCols,
                Cost input_startup_cost, Cost input_total_cost,
                double input_tuples)
 {
     Cost        startup_cost;
     Cost        total_cost;
+    ListCell   *lc;
 
     startup_cost = input_startup_cost;
     total_cost = input_total_cost;
 
     /*
-     * We charge one cpu_operator_cost per window function per tuple (often a
-     * drastic underestimate, but without a way to gauge how many tuples the
-     * window function will fetch, it's hard to do better).  We also charge
-     * cpu_operator_cost per grouping column per tuple for grouping
-     * comparisons, plus cpu_tuple_cost per tuple for general overhead.
-     */
-    total_cost += cpu_operator_cost * input_tuples * numWindowFuncs;
-    total_cost += cpu_operator_cost * input_tuples * (numPartCols + numOrderCols);
+     * Window functions are assumed to cost their stated execution cost, plus
+     * the cost of evaluating their input expressions, per tuple.  Since they
+     * may in fact evaluate their inputs at multiple rows during each cycle,
+     * this could be a drastic underestimate; but without a way to know how
+     * many rows the window function will fetch, it's hard to do better.  In
+     * any case, it's a good estimate for all the built-in window functions,
+     * so we'll just do this for now.
+     */
+    foreach(lc, windowFuncs)
+    {
+        WindowFunc *wfunc = (WindowFunc *) lfirst(lc);
+        Cost        wfunccost;
+        QualCost    argcosts;
+
+        Assert(IsA(wfunc, WindowFunc));
+
+        wfunccost = get_func_cost(wfunc->winfnoid) * cpu_operator_cost;
+
+        /* also add the input expressions' cost to per-input-row costs */
+        cost_qual_eval_node(&argcosts, (Node *) wfunc->args, root);
+        startup_cost += argcosts.startup;
+        wfunccost += argcosts.per_tuple;
+
+        total_cost += wfunccost * input_tuples;
+    }
+
+    /*
+     * We also charge cpu_operator_cost per grouping column per tuple for
+     * grouping comparisons, plus cpu_tuple_cost per tuple for general
+     * overhead.
+     *
+     * XXX this neglects costs of spooling the data to disk when it overflows
+     * work_mem.  Sooner or later that should get accounted for.
+     */
+    total_cost += cpu_operator_cost * (numPartCols + numOrderCols) * input_tuples;
     total_cost += cpu_tuple_cost * input_tuples;
 
     path->startup_cost = startup_cost;
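
To make the loop above concrete: each window function is charged its pg_proc.procost times cpu_operator_cost per input row, plus whatever cost_qual_eval_node assigns to its argument expressions. A back-of-the-envelope comparison with invented numbers, contrasting a no-argument window function having the default procost of 1 with a window aggregate whose argument expression costs one extra operator evaluation per row:

#include <stdio.h>

int
main(void)
{
    double  cpu_operator_cost = 0.0025; /* stock default */
    double  input_tuples = 10000.0;     /* invented input size */

    /* no-argument window function, default procost of 1 */
    double  noarg_cost = 1.0 * cpu_operator_cost + 0.0;

    /* window aggregate whose argument costs one operator per row */
    double  witharg_cost = 1.0 * cpu_operator_cost + 1.0 * cpu_operator_cost;

    printf("charged over %.0f rows: %.2f vs %.2f\n", input_tuples,
           noarg_cost * input_tuples, witharg_cost * input_tuples);
    return 0;
}
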
@@ -2640,17 +2683,12 @@ cost_qual_eval_walker(Node *node, cost_qual_eval_context *context)
      * Vars and Consts are charged zero, and so are boolean operators (AND,
      * OR, NOT).  Simplistic, but a lot better than no model at all.
      *
-     * Note that Aggref and WindowFunc nodes are (and should be) treated like
-     * Vars --- whatever execution cost they have is absorbed into
-     * plan-node-specific costing.  As far as expression evaluation is
-     * concerned they're just like Vars.
-     *
      * Should we try to account for the possibility of short-circuit
      * evaluation of AND/OR?  Probably *not*, because that would make the
      * results depend on the clause ordering, and we are not in any position
      * to expect that the current ordering of the clauses is the one that's
-     * going to end up being used.  (Is it worth applying order_qual_clauses
-     * much earlier in the planning process to fix this?)
+     * going to end up being used.  The above per-RestrictInfo caching would
+     * not mix well with trying to re-order clauses anyway.
      */
     if (IsA(node, FuncExpr))
     {
@@ -2679,6 +2717,20 @@ cost_qual_eval_walker(Node *node, cost_qual_eval_context *context)
         context->total.per_tuple += get_func_cost(saop->opfuncid) *
             cpu_operator_cost * estimate_array_length(arraynode) * 0.5;
     }
+    else if (IsA(node, Aggref) ||
+             IsA(node, WindowFunc))
+    {
+        /*
+         * Aggref and WindowFunc nodes are (and should be) treated like Vars,
+         * ie, zero execution cost in the current model, because they behave
+         * essentially like Vars in execQual.c.  We disregard the costs of
+         * their input expressions for the same reason.  The actual execution
+         * costs of the aggregate/window functions and their arguments have to
+         * be factored into plan-node-specific costing of the Agg or WindowAgg
+         * plan node.
+         */
+        return false;           /* don't recurse into children */
+    }
     else if (IsA(node, CoerceViaIO))
     {
         CoerceViaIO *iocoerce = (CoerceViaIO *) node;
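
Because the walker now skips Aggref and WindowFunc entirely, those costs must reach cost_agg through its AggClauseCosts argument (and cost_windowagg through its windowFuncs list). The fragment below sketches how a planner-side caller would supply them; it assumes the count_agg_clauses() collector is what fills AggClauseCosts in this patch series, and the variable names and call sites are approximations rather than code copied from the tree.

    AggClauseCosts agg_costs;

    /* gather per-aggregate costs from the places aggregates can appear;
     * count_agg_clauses() and these call sites are assumptions, shown only
     * to indicate where the costs skipped by the walker get accounted for */
    MemSet(&agg_costs, 0, sizeof(AggClauseCosts));
    count_agg_clauses(root, (Node *) tlist, &agg_costs);
    count_agg_clauses(root, parse->havingQual, &agg_costs);

    /* ... later, when costing a hashed-aggregation plan ... */
    cost_agg(&hashed_path, root, AGG_HASHED, &agg_costs,
             numGroupCols, dNumGroups,
             cheapest_startup_cost, cheapest_total_cost,
             path_rows);
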