Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit5543677

Browse files
committed
Use Limit instead of Unique to implement DISTINCT, when possible
When all of the query's DISTINCT pathkeys have been marked as redundantdue to EquivalenceClasses existing which contain constants, we can justimplement the DISTINCT operation on a query by just limiting the number ofreturned rows to 1 instead of performing a Unique on all of the matching(duplicate) rows.This applies in cases such as:SELECT DISTINCT col,col2 FROM tab WHERE col = 1 AND col2 = 10;If there are any matching rows, then they must all be {1,10}. There's nopoint in fetching all of those and running a Unique operator on them toleave only a single row. Here we effectively just find the first row andthen stop. We are obviously unable to apply this optimization if eitherthe col = 1 or col2 = 10 were missing from the WHERE clause or if therewere any additional columns in the SELECT clause.Such queries are probably not all that common, but detecting when we canapply this optimization amounts to checking if the distinct_pathkeys areNULL, which is very cheap indeed.Nothing is done here to check if the query already has a LIMIT clause. Ifit does then the plan may end up with 2 Limits nodes. There's no harm inthat and it's probably not worth the complexity to unify them into asingle Limit node.Author: David RowleyReviewed-by: Richard GuoDiscussion:https://postgr.es/m/CAApHDvqS0j8RUWRUSgCAXxOqnYjHUXmKwspRj4GzVfOO25ByHA@mail.gmail.comDiscussion:https://postgr.es/m/MEYPR01MB7101CD5DA0A07C9DE2B74850A4239@MEYPR01MB7101.ausprd01.prod.outlook.com
1 parentb1099ec commit5543677

File tree

5 files changed

+221
-11
lines changed

5 files changed

+221
-11
lines changed

‎src/backend/optimizer/plan/planner.c

Lines changed: 66 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4780,11 +4780,46 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
47804780

47814781
if (pathkeys_contained_in(needed_pathkeys,path->pathkeys))
47824782
{
4783-
add_path(distinct_rel, (Path*)
4784-
create_upper_unique_path(root,distinct_rel,
4785-
path,
4786-
list_length(root->distinct_pathkeys),
4787-
numDistinctRows));
4783+
/*
4784+
* distinct_pathkeys may have become empty if all of the
4785+
* pathkeys were determined to be redundant. If all of the
4786+
* pathkeys are redundant then each DISTINCT target must only
4787+
* allow a single value, therefore all resulting tuples must
4788+
* be identical (or at least indistinguishable by an equality
4789+
* check). We can uniquify these tuples simply by just taking
4790+
* the first tuple. All we do here is add a path to do "LIMIT
4791+
* 1" atop of 'path'. When doing a DISTINCT ON we may still
4792+
* have a non-NIL sort_pathkeys list, so we must still only do
4793+
* this with paths which are correctly sorted by
4794+
* sort_pathkeys.
4795+
*/
4796+
if (root->distinct_pathkeys==NIL)
4797+
{
4798+
Node*limitCount;
4799+
4800+
limitCount= (Node*)makeConst(INT8OID,-1,InvalidOid,
4801+
sizeof(int64),
4802+
Int64GetDatum(1), false,
4803+
FLOAT8PASSBYVAL);
4804+
4805+
/*
4806+
* If the query already has a LIMIT clause, then we could
4807+
* end up with a duplicate LimitPath in the final plan.
4808+
* That does not seem worth troubling over too much.
4809+
*/
4810+
add_path(distinct_rel, (Path*)
4811+
create_limit_path(root,distinct_rel,path,NULL,
4812+
limitCount,LIMIT_OPTION_COUNT,
4813+
0,1));
4814+
}
4815+
else
4816+
{
4817+
add_path(distinct_rel, (Path*)
4818+
create_upper_unique_path(root,distinct_rel,
4819+
path,
4820+
list_length(root->distinct_pathkeys),
4821+
numDistinctRows));
4822+
}
47884823
}
47894824
}
47904825

@@ -4805,13 +4840,33 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
48054840
path= (Path*)create_sort_path(root,distinct_rel,
48064841
path,
48074842
needed_pathkeys,
4808-
-1.0);
4843+
root->distinct_pathkeys==NIL ?
4844+
1.0 :-1.0);
48094845

4810-
add_path(distinct_rel, (Path*)
4811-
create_upper_unique_path(root,distinct_rel,
4812-
path,
4813-
list_length(root->distinct_pathkeys),
4814-
numDistinctRows));
4846+
/*
4847+
* As above, use a LimitPath instead of a UniquePath when all of the
4848+
* distinct_pathkeys are redundant and we're only going to get a
4849+
* series of tuples all with the same values anyway.
4850+
*/
4851+
if (root->distinct_pathkeys==NIL)
4852+
{
4853+
Node*limitCount= (Node*)makeConst(INT8OID,-1,InvalidOid,
4854+
sizeof(int64),
4855+
Int64GetDatum(1), false,
4856+
FLOAT8PASSBYVAL);
4857+
4858+
add_path(distinct_rel, (Path*)
4859+
create_limit_path(root,distinct_rel,path,NULL,
4860+
limitCount,LIMIT_OPTION_COUNT,0,1));
4861+
}
4862+
else
4863+
{
4864+
add_path(distinct_rel, (Path*)
4865+
create_upper_unique_path(root,distinct_rel,
4866+
path,
4867+
list_length(root->distinct_pathkeys),
4868+
numDistinctRows));
4869+
}
48154870
}
48164871

48174872
/*

‎src/test/regress/expected/select_distinct.out

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,60 @@ RESET max_parallel_workers_per_gather;
279279
RESET min_parallel_table_scan_size;
280280
RESET parallel_setup_cost;
281281
RESET parallel_tuple_cost;
282+
--
283+
-- Test the planner's ability to use a LIMIT 1 instead of a Unique node when
284+
-- all of the distinct_pathkeys have been marked as redundant
285+
--
286+
-- Ensure we get a plan with a Limit 1
287+
EXPLAIN (COSTS OFF)
288+
SELECT DISTINCT four FROM tenk1 WHERE four = 0;
289+
QUERY PLAN
290+
----------------------------
291+
Limit
292+
-> Seq Scan on tenk1
293+
Filter: (four = 0)
294+
(3 rows)
295+
296+
-- Ensure the above gives us the correct result
297+
SELECT DISTINCT four FROM tenk1 WHERE four = 0;
298+
four
299+
------
300+
0
301+
(1 row)
302+
303+
-- Ensure we get a plan with a Limit 1
304+
EXPLAIN (COSTS OFF)
305+
SELECT DISTINCT four FROM tenk1 WHERE four = 0 AND two <> 0;
306+
QUERY PLAN
307+
---------------------------------------------
308+
Limit
309+
-> Seq Scan on tenk1
310+
Filter: ((two <> 0) AND (four = 0))
311+
(3 rows)
312+
313+
-- Ensure no rows are returned
314+
SELECT DISTINCT four FROM tenk1 WHERE four = 0 AND two <> 0;
315+
four
316+
------
317+
(0 rows)
318+
319+
-- Ensure we get a plan with a Limit 1 when the SELECT list contains constants
320+
EXPLAIN (COSTS OFF)
321+
SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0;
322+
QUERY PLAN
323+
----------------------------
324+
Limit
325+
-> Seq Scan on tenk1
326+
Filter: (four = 0)
327+
(3 rows)
328+
329+
-- Ensure we only get 1 row
330+
SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0;
331+
four | ?column? | ?column? | ?column?
332+
------+----------+----------+----------
333+
0 | 1 | 2 | 3
334+
(1 row)
335+
282336
--
283337
-- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its
284338
-- very own regression file.

‎src/test/regress/expected/select_distinct_on.out

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,3 +73,53 @@ select distinct on (1) floor(random()) as r, f1 from int4_tbl order by 1,2;
7373
0 | -2147483647
7474
(1 row)
7575

76+
--
77+
-- Test the planner's ability to use a LIMIT 1 instead of a Unique node when
78+
-- all of the distinct_pathkeys have been marked as redundant
79+
--
80+
-- Ensure we also get a LIMIT plan with DISTINCT ON
81+
EXPLAIN (COSTS OFF)
82+
SELECT DISTINCT ON (four) four,two
83+
FROM tenk1 WHERE four = 0 ORDER BY 1;
84+
QUERY PLAN
85+
----------------------------------
86+
Result
87+
-> Limit
88+
-> Seq Scan on tenk1
89+
Filter: (four = 0)
90+
(4 rows)
91+
92+
-- and check the result of the above query is correct
93+
SELECT DISTINCT ON (four) four,two
94+
FROM tenk1 WHERE four = 0 ORDER BY 1;
95+
four | two
96+
------+-----
97+
0 | 0
98+
(1 row)
99+
100+
-- Ensure a Sort -> Limit is used when the ORDER BY contains additional cols
101+
EXPLAIN (COSTS OFF)
102+
SELECT DISTINCT ON (four) four,two
103+
FROM tenk1 WHERE four = 0 ORDER BY 1,2;
104+
QUERY PLAN
105+
----------------------------------
106+
Limit
107+
-> Sort
108+
Sort Key: two
109+
-> Seq Scan on tenk1
110+
Filter: (four = 0)
111+
(5 rows)
112+
113+
-- Same again but use a column that is indexed so that we get an index scan
114+
-- then a limit
115+
EXPLAIN (COSTS OFF)
116+
SELECT DISTINCT ON (four) four,hundred
117+
FROM tenk1 WHERE four = 0 ORDER BY 1,2;
118+
QUERY PLAN
119+
-----------------------------------------------------
120+
Result
121+
-> Limit
122+
-> Index Scan using tenk1_hundred on tenk1
123+
Filter: (four = 0)
124+
(4 rows)
125+

‎src/test/regress/sql/select_distinct.sql

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,32 @@ RESET min_parallel_table_scan_size;
146146
RESET parallel_setup_cost;
147147
RESET parallel_tuple_cost;
148148

149+
--
150+
-- Test the planner's ability to use a LIMIT 1 instead of a Unique node when
151+
-- all of the distinct_pathkeys have been marked as redundant
152+
--
153+
154+
-- Ensure we get a plan with a Limit 1
155+
EXPLAIN (COSTS OFF)
156+
SELECT DISTINCT fourFROM tenk1WHERE four=0;
157+
158+
-- Ensure the above gives us the correct result
159+
SELECT DISTINCT fourFROM tenk1WHERE four=0;
160+
161+
-- Ensure we get a plan with a Limit 1
162+
EXPLAIN (COSTS OFF)
163+
SELECT DISTINCT fourFROM tenk1WHERE four=0AND two<>0;
164+
165+
-- Ensure no rows are returned
166+
SELECT DISTINCT fourFROM tenk1WHERE four=0AND two<>0;
167+
168+
-- Ensure we get a plan with a Limit 1 when the SELECT list contains constants
169+
EXPLAIN (COSTS OFF)
170+
SELECT DISTINCT four,1,2,3FROM tenk1WHERE four=0;
171+
172+
-- Ensure we only get 1 row
173+
SELECT DISTINCT four,1,2,3FROM tenk1WHERE four=0;
174+
149175
--
150176
-- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its
151177
-- very own regression file.

‎src/test/regress/sql/select_distinct_on.sql

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,28 @@ SELECT DISTINCT ON (string4, ten) string4, ten, two
1717

1818
-- bug #5049: early 8.4.x chokes on volatile DISTINCT ON clauses
1919
select distincton (1) floor(random())as r, f1from int4_tblorder by1,2;
20+
21+
--
22+
-- Test the planner's ability to use a LIMIT 1 instead of a Unique node when
23+
-- all of the distinct_pathkeys have been marked as redundant
24+
--
25+
26+
-- Ensure we also get a LIMIT plan with DISTINCT ON
27+
EXPLAIN (COSTS OFF)
28+
SELECT DISTINCTON (four) four,two
29+
FROM tenk1WHERE four=0ORDER BY1;
30+
31+
-- and check the result of the above query is correct
32+
SELECT DISTINCTON (four) four,two
33+
FROM tenk1WHERE four=0ORDER BY1;
34+
35+
-- Ensure a Sort -> Limit is used when the ORDER BY contains additional cols
36+
EXPLAIN (COSTS OFF)
37+
SELECT DISTINCTON (four) four,two
38+
FROM tenk1WHERE four=0ORDER BY1,2;
39+
40+
-- Same again but use a column that is indexed so that we get an index scan
41+
-- then a limit
42+
EXPLAIN (COSTS OFF)
43+
SELECT DISTINCTON (four) four,hundred
44+
FROM tenk1WHERE four=0ORDER BY1,2;

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp