NotificationsYou must be signed in to change notification settings
Fork6
Star31

Commit2686ee1

committed

Collect and use multi-column dependency stats

Follow on patch in the multi-variate statistics patch series.CREATE STATISTICS s1 WITH (dependencies) ON (a, b) FROM t;ANALYZE;will collect dependency stats on (a, b) and then use the measureddependency in subsequent query planning.Commit7b504eb addedCREATE STATISTICS with n-distinct coefficients. These are nowspecified using the mutually exclusive option WITH (ndistinct).Author: Tomas Vondra, David RowleyReviewed-by: Kyotaro HORIGUCHI, Álvaro Herrera, Dean Rasheed, Robert Haasand many other comments and contributionsDiscussion:https://postgr.es/m/56f40b20-c464-fad2-ff39-06b668fac47c@2ndquadrant.com

1 parent00b6b6f commit2686ee1Copy full SHA for 2686ee1

File tree

31 files changed

+2035

-79

lines changed

contrib
- file_fdw
  - file_fdw.c
- postgres_fdw
  - postgres_fdw.c
doc/src/sgml
src
- backend
  - catalog
    - system_views.sql
  - commands
    - statscmds.c
  - optimizer
    - path
      - clausesel.c
      - costsize.c
    - util
      - orclauses.c
      - plancat.c
  - statistics
  - utils/adt
    - ruleutils.c
    - selfuncs.c
- bin/psql
  - describe.c
- include
  - catalog
  - optimizer
    - cost.h
  - statistics
    - extended_stats_internal.h
    - statistics.h
- test/regress
  - expected
  - sql
    - stats_ext.sql

31 files changed

+2035

-79

lines changed

`‎contrib/file_fdw/file_fdw.c`

Lines changed: 1 addition & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -1013,6 +1013,7 @@ estimate_size(PlannerInfo root, RelOptInfo baserel,`
`1013`	`1013`	`baserel->baserestrictinfo,`
`1014`	`1014`	`0,`
`1015`	`1015`	`JOIN_INNER,`
	`1016`	`+NULL,`
`1016`	`1017`	`NULL);`
`1017`	`1018`
`1018`	`1019`	`nrows=clamp_row_est(nrows);`

`‎contrib/postgres_fdw/postgres_fdw.c`

Lines changed: 4 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -591,6 +591,7 @@ postgresGetForeignRelSize(PlannerInfo *root,`
`591`	`591`	`fpinfo->local_conds,`
`592`	`592`	`baserel->relid,`
`593`	`593`	`JOIN_INNER,`
	`594`	`+NULL,`
`594`	`595`	`NULL);`
`595`	`596`
`596`	`597`	`cost_qual_eval(&fpinfo->local_conds_cost,fpinfo->local_conds,root);`
`@@ -2572,6 +2573,7 @@ estimate_path_cost_size(PlannerInfo *root,`
`2572`	`2573`	`local_param_join_conds,`
`2573`	`2574`	`foreignrel->relid,`
`2574`	`2575`	`JOIN_INNER,`
	`2576`	`+NULL,`
`2575`	`2577`	`NULL);`
`2576`	`2578`	`local_sel *=fpinfo->local_conds_sel;`
`2577`	`2579`
`@@ -4455,6 +4457,7 @@ postgresGetForeignJoinPaths(PlannerInfo *root,`
`4455`	`4457`	`fpinfo->local_conds,`
`4456`	`4458`	`0,`
`4457`	`4459`	`JOIN_INNER,`
	`4460`	`+NULL,`
`4458`	`4461`	`NULL);`
`4459`	`4462`	`cost_qual_eval(&fpinfo->local_conds_cost,fpinfo->local_conds,root);`
`4460`	`4463`
`@@ -4465,7 +4468,7 @@ postgresGetForeignJoinPaths(PlannerInfo *root,`
`4465`	`4468`	`if (!fpinfo->use_remote_estimate)`
`4466`	`4469`	`fpinfo->joinclause_sel=clauselist_selectivity(root,fpinfo->joinclauses,`
`4467`	`4470`	`0,fpinfo->jointype,`
`4468`		`-extra->sjinfo);`
	`4471`	`+extra->sjinfo,NULL);`
`4469`	`4472`
`4470`	`4473`	`/* Estimate costs for bare join relation */`
`4471`	`4474`	`estimate_path_cost_size(root,joinrel,NIL,NIL,&rows,`

`‎doc/src/sgml/catalogs.sgml`

Lines changed: 9 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -4339,6 +4339,15 @@`
`4339`	`4339`	`</entry>`
`4340`	`4340`	`</row>`
`4341`	`4341`
	`4342`	`+ <row>`
	`4343`	`+ <entry><structfield>stadependencies</structfield></entry>`
	`4344`	`+ <entry><type>pg_dependencies</type></entry>`
	`4345`	`+ <entry></entry>`
	`4346`	`+ <entry>`
	`4347`	`+ Functional dependencies, serialized as <structname>pg_dependencies</> type.`
	`4348`	`+ </entry>`
	`4349`	`+ </row>`
	`4350`	`+`
`4342`	`4351`	`</tbody>`
`4343`	`4352`	`</tgroup>`
`4344`	`4353`	`</table>`

`‎doc/src/sgml/planstats.sgml`

Lines changed: 154 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -446,6 +446,160 @@ rows = (outer_cardinality * inner_cardinality) * selectivity`
`446`	`446`	`in <filename>src/backend/utils/adt/selfuncs.c</filename>.`
`447`	`447`	`</para>`
`448`	`448`
	`449`	`+ <sect2 id="functional-dependencies">`
	`450`	`+ <title>Functional Dependencies</title>`
	`451`	`+`
	`452`	`+ <para>`
	`453`	`+ The simplest type of extended statistics are functional dependencies,`
	`454`	`+ used in definitions of database normal forms. When simplified, saying that`
	`455`	`+ <literal>b</> is functionally dependent on <literal>a</> means that`
	`456`	`+ knowledge of value of <literal>a</> is sufficient to determine value of`
	`457`	`+ <literal>b</>.`
	`458`	`+ </para>`
	`459`	`+`
	`460`	`+ <para>`
	`461`	`+ In normalized databases, only functional dependencies on primary keys`
	`462`	`+ and superkeys are allowed. However, in practice, many data sets are not`
	`463`	`+ fully normalized, for example, due to intentional denormalization for`
	`464`	`+ performance reasons.`
	`465`	`+ </para>`
	`466`	`+`
	`467`	`+ <para>`
	`468`	`+ Functional dependencies directly affect accuracy of the estimates, as`
	`469`	`+ conditions on the dependent column(s) do not restrict the result set,`
	`470`	`+ resulting in underestimates.`
	`471`	`+ </para>`
	`472`	`+`
	`473`	`+ <para>`
	`474`	`+ To inform the planner about the functional dependencies, we collect`
	`475`	`+ measurements of dependency during <command>ANALYZE</>. Assessing`
	`476`	`+ dependency between all sets of columns would be prohibitively`
	`477`	`+ expensive, so we limit our search to potential dependencies defined`
	`478`	`+ using the <command>CREATE STATISTICS</> command.`
	`479`	`+`
	`480`	`+<programlisting>`
	`481`	`+CREATE TABLE t (a INT, b INT);`
	`482`	`+INSERT INTO t SELECT i/100, i/100 FROM generate_series(1,10000) s(i);`
	`483`	`+CREATE STATISTICS s1 WITH (dependencies) ON (a, b) FROM t;`
	`484`	`+ANALYZE t;`
	`485`	`+EXPLAIN ANALYZE SELECT * FROM t WHERE a = 1 AND b = 1;`
	`486`	`+ QUERY PLAN`
	`487`	`+-------------------------------------------------------------------------------------------------`
	`488`	`+ Seq Scan on t (cost=0.00..195.00 rows=100 width=8) (actual time=0.095..3.118 rows=100 loops=1)`
	`489`	`+ Filter: ((a = 1) AND (b = 1))`
	`490`	`+ Rows Removed by Filter: 9900`
	`491`	`+ Planning time: 0.367 ms`
	`492`	`+ Execution time: 3.380 ms`
	`493`	`+(5 rows)`
	`494`	`+</programlisting>`
	`495`	`+`
	`496`	`+ The planner is now aware of the functional dependencies and considers`
	`497`	`+ them when computing the selectivity of the second condition. Running`
	`498`	`+ the query without the statistics would lead to quite different estimates.`
	`499`	`+`
	`500`	`+<programlisting>`
	`501`	`+DROP STATISTICS s1;`
	`502`	`+EXPLAIN ANALYZE SELECT * FROM t WHERE a = 1 AND b = 1;`
	`503`	`+ QUERY PLAN`
	`504`	`+-----------------------------------------------------------------------------------------------`
	`505`	`+ Seq Scan on t (cost=0.00..195.00 rows=1 width=8) (actual time=0.000..6.379 rows=100 loops=1)`
	`506`	`+ Filter: ((a = 1) AND (b = 1))`
	`507`	`+ Rows Removed by Filter: 9900`
	`508`	`+ Planning time: 0.000 ms`
	`509`	`+ Execution time: 6.379 ms`
	`510`	`+(5 rows)`
	`511`	`+</programlisting>`
	`512`	`+ </para>`
	`513`	`+`
	`514`	`+ <para>`
	`515`	`+ If no dependency exists, the collected statistics do not influence the`
	`516`	`+ query plan. The only effect is to slow down <command>ANALYZE</>. Should`
	`517`	`+ partial dependencies exist these will also be stored and applied`
	`518`	`+ during planning.`
	`519`	`+ </para>`
	`520`	`+`
	`521`	`+ <para>`
	`522`	`+ Similarly to per-column statistics, extended statistics are stored in`
	`523`	`+ a system catalog called <structname>pg_statistic_ext</structname>, but`
	`524`	`+ there is also a more convenient view <structname>pg_stats_ext</structname>.`
	`525`	`+ To inspect the statistics <literal>s1</literal> defined above,`
	`526`	`+ you may do this:`
	`527`	`+`
	`528`	`+<programlisting>`
	`529`	`+SELECT tablename, staname, attnums, depsbytes`
	`530`	`+ FROM pg_stats_ext WHERE staname = 's1';`
	`531`	`+ tablename \| staname \| attnums \| depsbytes`
	`532`	`+-----------+---------+---------+-----------`
	`533`	`+ t \| s1 \| 1 2 \| 40`
	`534`	`+(1 row)`
	`535`	`+</programlisting>`
	`536`	`+`
	`537`	`+ This shows that the statistics are defined on table <structname>t</>,`
	`538`	`+ <structfield>attnums</structfield> lists attribute numbers of columns`
	`539`	`+ (references <structname>pg_attribute</structname>). It also shows`
	`540`	`+ the length in bytes of the functional dependencies, as found by`
	`541`	`+ <command>ANALYZE</> when serialized into a <literal>bytea</> column.`
	`542`	`+ </para>`
	`543`	`+`
	`544`	`+ <para>`
	`545`	`+ When computing the selectivity, the planner inspects all conditions and`
	`546`	`+ attempts to identify which conditions are already implied by other`
	`547`	`+ conditions. The selectivity estimates from any redundant conditions are`
	`548`	`+ ignored from a selectivity point of view. In the example query above,`
	`549`	`+ the selectivity estimates for either of the conditions may be eliminated,`
	`550`	`+ thus improving the overall estimate.`
	`551`	`+ </para>`
	`552`	`+`
	`553`	`+ <sect3 id="functional-dependencies-limitations">`
	`554`	`+ <title>Limitations of functional dependencies</title>`
	`555`	`+`
	`556`	`+ <para>`
	`557`	`+ Functional dependencies are a very simple type of statistics, and`
	`558`	`+ as such have several limitations. The first limitation is that they`
	`559`	`+ only work with simple equality conditions, comparing columns and constant`
	`560`	`+ values. It's not possible to use them to eliminate equality conditions`
	`561`	`+ comparing two columns or a column to an expression, range clauses,`
	`562`	`+ <literal>LIKE</> or any other type of conditions.`
	`563`	`+ </para>`
	`564`	`+`
	`565`	`+ <para>`
	`566`	`+ When eliminating the implied conditions, the planner assumes that the`
	`567`	`+ conditions are compatible. Consider the following example, violating`
	`568`	`+ this assumption:`
	`569`	`+`
	`570`	`+<programlisting>`
	`571`	`+EXPLAIN ANALYZE SELECT * FROM t WHERE a = 1 AND b = 10;`
	`572`	`+ QUERY PLAN`
	`573`	`+-----------------------------------------------------------------------------------------------`
	`574`	`+ Seq Scan on t (cost=0.00..195.00 rows=100 width=8) (actual time=2.992..2.992 rows=0 loops=1)`
	`575`	`+ Filter: ((a = 1) AND (b = 10))`
	`576`	`+ Rows Removed by Filter: 10000`
	`577`	`+ Planning time: 0.232 ms`
	`578`	`+ Execution time: 3.033 ms`
	`579`	`+(5 rows)`
	`580`	`+</programlisting>`
	`581`	`+`
	`582`	`+ While there are no rows with such combination of values, the planner`
	`583`	`+ is unable to verify whether the values match - it only knows that`
	`584`	`+ the columns are functionally dependent.`
	`585`	`+ </para>`
	`586`	`+`
	`587`	`+ <para>`
	`588`	`+ This assumption is more about queries executed on the database - in many`
	`589`	`+ cases, it's actually satisfied (e.g. when the GUI only allows selecting`
	`590`	`+ compatible values). But if that's not the case, functional dependencies`
	`591`	`+ may not be a viable option.`
	`592`	`+ </para>`
	`593`	`+`
	`594`	`+ <para>`
	`595`	`+ For additional information about functional dependencies, see`
	`596`	`+ <filename>src/backend/statistics/README.dependencies</>.`
	`597`	`+ </para>`
	`598`	`+`
	`599`	`+ </sect3>`
	`600`	`+`
	`601`	`+ </sect2>`
	`602`	`+`
`449`	`603`	`</sect1>`
`450`	`604`
`451`	`605`	`</chapter>`

`‎doc/src/sgml/ref/create_statistics.sgml`

Lines changed: 39 additions & 3 deletions

Original file line number	Diff line number	Diff line change
`@@ -21,8 +21,9 @@ PostgreSQL documentation`
`21`	`21`
`22`	`22`	`<refsynopsisdiv>`
`23`	`23`	`<synopsis>`
`24`		`-CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_name</replaceable> ON (`
`25`		`- <replaceable class="PARAMETER">column_name</replaceable>, <replaceable class="PARAMETER">column_name</replaceable> [, ...])`
	`24`	`+CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_name</replaceable>`
	`25`	`+ WITH ( <replaceable class="PARAMETER">option</replaceable> [= <replaceable class="PARAMETER">value</replaceable>] [, ... ] )`
	`26`	`+ ON ( <replaceable class="PARAMETER">column_name</replaceable>, <replaceable class="PARAMETER">column_name</replaceable> [, ...])`
`26`	`27`	`FROM <replaceable class="PARAMETER">table_name</replaceable>`
`27`	`28`	`</synopsis>`
`28`	`29`
`@@ -94,6 +95,41 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_na`
`94`	`95`
`95`	`96`	`</variablelist>`
`96`	`97`
	`98`	`+ <refsect2 id="SQL-CREATESTATISTICS-parameters">`
	`99`	`+ <title id="SQL-CREATESTATISTICS-parameters-title">Parameters</title>`
	`100`	`+`
	`101`	`+ <indexterm zone="sql-createstatistics-parameters">`
	`102`	`+ <primary>statistics parameters</primary>`
	`103`	`+ </indexterm>`
	`104`	`+`
	`105`	`+ <para>`
	`106`	`+ The <literal>WITH</> clause can specify <firstterm>options</>`
	`107`	`+ for the statistics. Available options are listed below.`
	`108`	`+ </para>`
	`109`	`+`
	`110`	`+ <variablelist>`
	`111`	`+`
	`112`	`+ <varlistentry>`
	`113`	`+ <term><literal>dependencies</> (<type>boolean</>)</term>`
	`114`	`+ <listitem>`
	`115`	`+ <para>`
	`116`	`+ Enables functional dependencies for the statistics.`
	`117`	`+ </para>`
	`118`	`+ </listitem>`
	`119`	`+ </varlistentry>`
	`120`	`+`
	`121`	`+ <varlistentry>`
	`122`	`+ <term><literal>ndistinct</> (<type>boolean</>)</term>`
	`123`	`+ <listitem>`
	`124`	`+ <para>`
	`125`	`+ Enables ndistinct coefficients for the statistics.`
	`126`	`+ </para>`
	`127`	`+ </listitem>`
	`128`	`+ </varlistentry>`
	`129`	`+`
	`130`	`+ </variablelist>`
	`131`	`+`
	`132`	`+ </refsect2>`
`97`	`133`	`</refsect1>`
`98`	`134`
`99`	`135`	`<refsect1>`
`@@ -122,7 +158,7 @@ CREATE TABLE t1 (`
`122`	`158`	`INSERT INTO t1 SELECT i/100, i/500`
`123`	`159`	`FROM generate_series(1,1000000) s(i);`
`124`	`160`
`125`		`-CREATE STATISTICS s1 ON (a, b) FROM t1;`
	`161`	`+CREATE STATISTICS s1WITH (dependencies)ON (a, b) FROM t1;`
`126`	`162`
`127`	`163`	`ANALYZE t1;`
`128`	`164`

`‎src/backend/catalog/system_views.sql`

Lines changed: 2 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -192,7 +192,8 @@ CREATE VIEW pg_stats_ext AS`
`192`	`192`	`C.relnameAS tablename,`
`193`	`193`	`S.stanameAS staname,`
`194`	`194`	`S.stakeysAS attnums,`
`195`		`- length(s.standistinct)AS ndistbytes`
	`195`	`+ length(s.standistinct::bytea)AS ndistbytes,`
	`196`	`+ length(S.stadependencies::bytea)AS depsbytes`
`196`	`197`	`FROM (pg_statistic_ext SJOIN pg_class CON (C.oid=S.starelid))`
`197`	`198`	`LEFT JOIN pg_namespace NON (N.oid=C.relnamespace);`
`198`	`199`

`‎src/backend/commands/statscmds.c`

Lines changed: 15 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -62,10 +62,11 @@ CreateStatistics(CreateStatsStmt *stmt)`
`62`	`62`	`Oidrelid;`
`63`	`63`	`ObjectAddressparentobject,`
`64`	`64`	`childobject;`
`65`		`-Datumtypes[1];/only ndistinct defined now /`
	`65`	`+Datumtypes[2];/one for each possible type of statistics /`
`66`	`66`	`intntypes;`
`67`	`67`	`ArrayType*staenabled;`
`68`	`68`	`boolbuild_ndistinct;`
	`69`	`+boolbuild_dependencies;`
`69`	`70`	`boolrequested_type= false;`
`70`	`71`
`71`	`72`	`Assert(IsA(stmt,CreateStatsStmt));`
`@@ -159,7 +160,7 @@ CreateStatistics(CreateStatsStmt *stmt)`
`159`	`160`	`errmsg("statistics require at least 2 columns")));`
`160`	`161`
`161`	`162`	`/*`
`162`		`- * Sort the attnums, which makes detectingduplicies somewhat easier, and`
	`163`	`+ * Sort the attnums, which makes detectingduplicities somewhat easier, and`
`163`	`164`	`* it does not hurt (it does not affect the efficiency, unlike for`
`164`	`165`	`* indexes, for example).`
`165`	`166`	`*/`
`@@ -182,6 +183,7 @@ CreateStatistics(CreateStatsStmt *stmt)`
`182`	`183`	`* recognized.`
`183`	`184`	`*/`
`184`	`185`	`build_ndistinct= false;`
	`186`	`+build_dependencies= false;`
`185`	`187`	`foreach(l,stmt->options)`
`186`	`188`	`{`
`187`	`189`	`DefElemopt= (DefElem)lfirst(l);`
`@@ -191,6 +193,11 @@ CreateStatistics(CreateStatsStmt *stmt)`
`191`	`193`	`build_ndistinct=defGetBoolean(opt);`
`192`	`194`	`requested_type= true;`
`193`	`195`	`}`
	`196`	`+elseif (strcmp(opt->defname,"dependencies")==0)`
	`197`	`+{`
	`198`	`+build_dependencies=defGetBoolean(opt);`
	`199`	`+requested_type= true;`
	`200`	`+}`
`194`	`201`	`else`
`195`	`202`	`ereport(ERROR,`
`196`	`203`	`(errcode(ERRCODE_SYNTAX_ERROR),`
`@@ -199,12 +206,17 @@ CreateStatistics(CreateStatsStmt *stmt)`
`199`	`206`	`}`
`200`	`207`	`/* If no statistic type was specified, build them all. */`
`201`	`208`	`if (!requested_type)`
	`209`	`+{`
`202`	`210`	`build_ndistinct= true;`
	`211`	`+build_dependencies= true;`
	`212`	`+}`
`203`	`213`
`204`	`214`	`/* construct the char array of enabled statistic types */`
`205`	`215`	`ntypes=0;`
`206`	`216`	`if (build_ndistinct)`
`207`	`217`	`types[ntypes++]=CharGetDatum(STATS_EXT_NDISTINCT);`
	`218`	`+if (build_dependencies)`
	`219`	`+types[ntypes++]=CharGetDatum(STATS_EXT_DEPENDENCIES);`
`208`	`220`	`Assert(ntypes>0);`
`209`	`221`	`staenabled=construct_array(types,ntypes,CHAROID,1, true,'c');`
`210`	`222`
`@@ -222,6 +234,7 @@ CreateStatistics(CreateStatsStmt *stmt)`
`222`	`234`
`223`	`235`	`/* no statistics build yet */`
`224`	`236`	`nulls[Anum_pg_statistic_ext_standistinct-1]= true;`
	`237`	`+nulls[Anum_pg_statistic_ext_stadependencies-1]= true;`
`225`	`238`
`226`	`239`	`/* insert it into pg_statistic_ext */`
`227`	`240`	`statrel=heap_open(StatisticExtRelationId,RowExclusiveLock);`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit2686ee1

File tree

31 files changed

31 files changed

`‎contrib/file_fdw/file_fdw.c`

`‎contrib/postgres_fdw/postgres_fdw.c`

`‎doc/src/sgml/catalogs.sgml`

`‎doc/src/sgml/planstats.sgml`

`‎doc/src/sgml/ref/create_statistics.sgml`

`‎src/backend/catalog/system_views.sql`

`‎src/backend/commands/statscmds.c`

0 commit comments