Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitf6d208d

Browse files
TABLESAMPLE, SQL Standard and extensible
Add a TABLESAMPLE clause to SELECT statements that allowsuser to specify random BERNOULLI sampling or block levelSYSTEM sampling. Implementation allows for extensiblesampling functions to be written, using a standard API.Basic version follows SQLStandard exactly. Usableconcrete use cases for the sampling API follow in latercommits.Petr JelinekReviewed by Michael Paquier and Simon Riggs
1 parent11a83bb commitf6d208d

File tree

66 files changed

+2756
-40
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

66 files changed

+2756
-40
lines changed

‎contrib/file_fdw/file_fdw.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1097,7 +1097,7 @@ file_acquire_sample_rows(Relation onerel, int elevel,
10971097
* Found a suitable tuple, so save it, replacing one old tuple
10981098
* at random
10991099
*/
1100-
intk= (int) (targrows*sampler_random_fract());
1100+
intk= (int) (targrows*sampler_random_fract(rstate.randstate));
11011101

11021102
Assert(k >=0&&k<targrows);
11031103
heap_freetuple(rows[k]);

‎contrib/postgres_fdw/postgres_fdw.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2557,7 +2557,7 @@ analyze_row_processor(PGresult *res, int row, PgFdwAnalyzeState *astate)
25572557
if (astate->rowstoskip <=0)
25582558
{
25592559
/* Choose a random reservoir element to replace. */
2560-
pos= (int) (targrows*sampler_random_fract());
2560+
pos= (int) (targrows*sampler_random_fract(astate->rstate.randstate));
25612561
Assert(pos >=0&&pos<targrows);
25622562
heap_freetuple(astate->rows[pos]);
25632563
}

‎doc/src/sgml/catalogs.sgml

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,11 @@
278278
<entry>planner statistics</entry>
279279
</row>
280280

281+
<row>
282+
<entry><link linkend="catalog-pg-tablesample-method"><structname>pg_tablesample_method</structname></link></entry>
283+
<entry>table sampling methods</entry>
284+
</row>
285+
281286
<row>
282287
<entry><link linkend="catalog-pg-tablespace"><structname>pg_tablespace</structname></link></entry>
283288
<entry>tablespaces within this database cluster</entry>
@@ -6127,6 +6132,121 @@
61276132
</sect1>
61286133

61296134

6135+
<sect1 id="catalog-pg-tablesample-method">
6136+
<title><structname>pg_tabesample_method</structname></title>
6137+
6138+
<indexterm zone="catalog-pg-tablesample-method">
6139+
<primary>pg_am</primary>
6140+
</indexterm>
6141+
6142+
<para>
6143+
The catalog <structname>pg_tablesample_method</structname> stores
6144+
information about table sampling methods which can be used in
6145+
<command>TABLESAMPLE</command> clause of a <command>SELECT</command>
6146+
statement.
6147+
</para>
6148+
6149+
<table>
6150+
<title><structname>pg_tablesample_method</> Columns</title>
6151+
6152+
<tgroup cols="4">
6153+
<thead>
6154+
<row>
6155+
<entry>Name</entry>
6156+
<entry>Type</entry>
6157+
<entry>References</entry>
6158+
<entry>Description</entry>
6159+
</row>
6160+
</thead>
6161+
<tbody>
6162+
6163+
<row>
6164+
<entry><structfield>oid</structfield></entry>
6165+
<entry><type>oid</type></entry>
6166+
<entry></entry>
6167+
<entry>Row identifier (hidden attribute; must be explicitly selected)</entry>
6168+
</row>
6169+
6170+
<row>
6171+
<entry><structfield>tsmname</structfield></entry>
6172+
<entry><type>name</type></entry>
6173+
<entry></entry>
6174+
<entry>Name of the sampling method</entry>
6175+
</row>
6176+
6177+
<row>
6178+
<entry><structfield>tsmseqscan</structfield></entry>
6179+
<entry><type>bool</type></entry>
6180+
<entry></entry>
6181+
<entry>If true, the sampling method scans the whole table sequentially.
6182+
</entry>
6183+
</row>
6184+
6185+
<row>
6186+
<entry><structfield>tsmpagemode</structfield></entry>
6187+
<entry><type>bool</type></entry>
6188+
<entry></entry>
6189+
<entry>If true, the sampling method always reads the pages completely.
6190+
</entry>
6191+
</row>
6192+
6193+
<row>
6194+
<entry><structfield>tsminit</structfield></entry>
6195+
<entry><type>regproc</type></entry>
6196+
<entry><literal><link linkend="catalog-pg-proc"><structname>pg_proc</structname></link>.oid</literal></entry>
6197+
<entry><quote>Initialize the sampling scan</quote> function</entry>
6198+
</row>
6199+
6200+
<row>
6201+
<entry><structfield>tsmnextblock</structfield></entry>
6202+
<entry><type>regproc</type></entry>
6203+
<entry><literal><link linkend="catalog-pg-proc"><structname>pg_proc</structname></link>.oid</literal></entry>
6204+
<entry><quote>Get next block number</quote> function</entry>
6205+
</row>
6206+
6207+
<row>
6208+
<entry><structfield>tsmnexttuple</structfield></entry>
6209+
<entry><type>regproc</type></entry>
6210+
<entry><literal><link linkend="catalog-pg-proc"><structname>pg_proc</structname></link>.oid</literal></entry>
6211+
<entry><quote>Get next tuple offset</quote> function</entry>
6212+
</row>
6213+
6214+
<row>
6215+
<entry><structfield>tsmexaminetuple</structfield></entry>
6216+
<entry><type>regproc</type></entry>
6217+
<entry><literal><link linkend="catalog-pg-proc"><structname>pg_proc</structname></link>.oid</literal></entry>
6218+
<entry>Function which examines the tuple contents and decides if to
6219+
return it, or zero if none</entry>
6220+
</row>
6221+
6222+
<row>
6223+
<entry><structfield>tsmend</structfield></entry>
6224+
<entry><type>regproc</type></entry>
6225+
<entry><literal><link linkend="catalog-pg-proc"><structname>pg_proc</structname></link>.oid</literal></entry>
6226+
<entry><quote>End the sampling scan</quote> function</entry>
6227+
</row>
6228+
6229+
<row>
6230+
<entry><structfield>tsmreset</structfield></entry>
6231+
<entry><type>regproc</type></entry>
6232+
<entry><literal><link linkend="catalog-pg-proc"><structname>pg_proc</structname></link>.oid</literal></entry>
6233+
<entry><quote>Restart the state of sampling scan</quote> function</entry>
6234+
</row>
6235+
6236+
<row>
6237+
<entry><structfield>tsmcost</structfield></entry>
6238+
<entry><type>regproc</type></entry>
6239+
<entry><literal><link linkend="catalog-pg-proc"><structname>pg_proc</structname></link>.oid</literal></entry>
6240+
<entry>Costing function</entry>
6241+
</row>
6242+
6243+
</tbody>
6244+
</tgroup>
6245+
</table>
6246+
6247+
</sect1>
6248+
6249+
61306250
<sect1 id="catalog-pg-tablespace">
61316251
<title><structname>pg_tablespace</structname></title>
61326252

‎doc/src/sgml/ref/select.sgml

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ SELECT [ ALL | DISTINCT [ ON ( <replaceable class="parameter">expression</replac
4949

5050
<phrase>where <replaceable class="parameter">from_item</replaceable> can be one of:</phrase>
5151

52-
[ ONLY ] <replaceable class="parameter">table_name</replaceable> [ * ] [ [ AS ] <replaceable class="parameter">alias</replaceable> [ ( <replaceable class="parameter">column_alias</replaceable> [, ...] ) ] ]
52+
[ ONLY ] <replaceable class="parameter">table_name</replaceable> [ * ] [ [ AS ] <replaceable class="parameter">alias</replaceable> [ ( <replaceable class="parameter">column_alias</replaceable> [, ...] ) ] ] [ TABLESAMPLE <replaceable class="parameter">sampling_method</replaceable> ( <replaceable class="parameter">argument</replaceable> [, ...] ) [ REPEATABLE ( <replaceable class="parameter">seed</replaceable> ) ] ]
5353
[ LATERAL ] ( <replaceable class="parameter">select</replaceable> ) [ AS ] <replaceable class="parameter">alias</replaceable> [ ( <replaceable class="parameter">column_alias</replaceable> [, ...] ) ]
5454
<replaceable class="parameter">with_query_name</replaceable> [ [ AS ] <replaceable class="parameter">alias</replaceable> [ ( <replaceable class="parameter">column_alias</replaceable> [, ...] ) ] ]
5555
[ LATERAL ] <replaceable class="parameter">function_name</replaceable> ( [ <replaceable class="parameter">argument</replaceable> [, ...] ] )
@@ -316,6 +316,50 @@ TABLE [ ONLY ] <replaceable class="parameter">table_name</replaceable> [ * ]
316316
</listitem>
317317
</varlistentry>
318318

319+
<varlistentry>
320+
<term>TABLESAMPLE <replaceable class="parameter">sampling_method</replaceable> ( <replaceable class="parameter">argument</replaceable> [, ...] ) [ REPEATABLE ( <replaceable class="parameter">seed</replaceable> ) ]</term>
321+
<listitem>
322+
<para>
323+
Table sample clause after
324+
<replaceable class="parameter">table_name</replaceable> indicates that
325+
a <replaceable class="parameter">sampling_method</replaceable> should
326+
be used to retrieve subset of rows in the table.
327+
The <replaceable class="parameter">sampling_method</replaceable> can be
328+
any sampling method installed in the database. There are currently two
329+
sampling methods available in the standard
330+
<productname>PostgreSQL</productname> distribution:
331+
<itemizedlist>
332+
<listitem>
333+
<para><literal>SYSTEM</literal></para>
334+
</listitem>
335+
<listitem>
336+
<para><literal>BERNOULLI</literal></para>
337+
</listitem>
338+
</itemizedlist>
339+
Both of these sampling methods currently accept only single argument
340+
which is the percent (floating point from 0 to 100) of the rows to
341+
be returned.
342+
The <literal>SYSTEM</literal> sampling method does block level
343+
sampling with each block having the same chance of being selected and
344+
returns all rows from each selected block.
345+
The <literal>BERNOULLI</literal> scans whole table and returns
346+
individual rows with equal probability. Additional sampling methods
347+
may be installed in the database via extensions.
348+
</para>
349+
<para>
350+
The optional parameter <literal>REPEATABLE</literal> uses the seed
351+
parameter, which can be a number or expression producing a number, as
352+
a random seed for sampling. Note that subsequent commands may return
353+
different results even if same <literal>REPEATABLE</literal> clause was
354+
specified. This happens because <acronym>DML</acronym> statements and
355+
maintenance operations such as <command>VACUUM</> may affect physical
356+
distribution of data. The <function>setseed()</> function will not
357+
affect the sampling result when the <literal>REPEATABLE</literal>
358+
parameter is used.
359+
</para>
360+
</listitem>
361+
</varlistentry>
362+
319363
<varlistentry>
320364
<term><replaceable class="parameter">alias</replaceable></term>
321365
<listitem>
@@ -1927,5 +1971,20 @@ SELECT distributors.* WHERE distributors.name = 'Westward';
19271971
<literal>ROWS FROM( ... )</> is an extension of the SQL standard.
19281972
</para>
19291973
</refsect2>
1974+
1975+
<refsect2>
1976+
<title><literal>TABLESAMPLE</literal> clause</title>
1977+
1978+
<para>
1979+
The <literal>TABLESAMPLE</> clause is currently accepted only on physical
1980+
relations and materialized views.
1981+
</para>
1982+
1983+
<para>
1984+
Additional modules allow you to install custom sampling methods and use
1985+
them instead of the SQL standard methods.
1986+
</para>
1987+
</refsect2>
1988+
19301989
</refsect1>
19311990
</refentry>

‎src/backend/access/Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ subdir = src/backend/access
88
top_builddir = ../../..
99
include$(top_builddir)/src/Makefile.global
1010

11-
SUBDIRS = brin common gin gist hash heap index nbtree rmgrdesc spgist transam
11+
SUBDIRS = brin common gin gist hash heap index nbtree rmgrdesc spgist\
12+
tablesample transam
1213

1314
include$(top_srcdir)/src/backend/common.mk

‎src/backend/access/heap/heapam.c

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,9 @@ boolsynchronize_seqscans = true;
8080
staticHeapScanDescheap_beginscan_internal(Relationrelation,
8181
Snapshotsnapshot,
8282
intnkeys,ScanKeykey,
83-
boolallow_strat,boolallow_sync,
84-
boolis_bitmapscan,booltemp_snap);
83+
boolallow_strat,boolallow_sync,boolallow_pagemode,
84+
boolis_bitmapscan,boolis_samplescan,
85+
booltemp_snap);
8586
staticHeapTupleheap_prepare_insert(Relationrelation,HeapTupletup,
8687
TransactionIdxid,CommandIdcid,intoptions);
8788
staticXLogRecPtrlog_heap_update(Relationreln,Bufferoldbuf,
@@ -294,9 +295,10 @@ initscan(HeapScanDesc scan, ScanKey key, bool is_rescan)
294295

295296
/*
296297
* Currently, we don't have a stats counter for bitmap heap scans (but the
297-
* underlying bitmap index scans will be counted).
298+
* underlying bitmap index scans will be counted) or sample scans (we only
299+
* update stats for tuple fetches there)
298300
*/
299-
if (!scan->rs_bitmapscan)
301+
if (!scan->rs_bitmapscan&& !scan->rs_samplescan)
300302
pgstat_count_heap_scan(scan->rs_rd);
301303
}
302304

@@ -315,7 +317,7 @@ heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk, BlockNumber numBlks)
315317
* In page-at-a-time mode it performs additional work, namely determining
316318
* which tuples on the page are visible.
317319
*/
318-
staticvoid
320+
void
319321
heapgetpage(HeapScanDescscan,BlockNumberpage)
320322
{
321323
Bufferbuffer;
@@ -1310,14 +1312,17 @@ heap_openrv_extended(const RangeVar *relation, LOCKMODE lockmode,
13101312
* HeapScanDesc for a bitmap heap scan. Although that scan technology is
13111313
* really quite unlike a standard seqscan, there is just enough commonality
13121314
* to make it worth using the same data structure.
1315+
*
1316+
* heap_beginscan_samplingscan is alternate entry point for setting up a
1317+
* HeapScanDesc for a TABLESAMPLE scan.
13131318
* ----------------
13141319
*/
13151320
HeapScanDesc
13161321
heap_beginscan(Relationrelation,Snapshotsnapshot,
13171322
intnkeys,ScanKeykey)
13181323
{
13191324
returnheap_beginscan_internal(relation,snapshot,nkeys,key,
1320-
true, true, false, false);
1325+
true, true,true, false,false, false);
13211326
}
13221327

13231328
HeapScanDesc
@@ -1327,7 +1332,7 @@ heap_beginscan_catalog(Relation relation, int nkeys, ScanKey key)
13271332
Snapshotsnapshot=RegisterSnapshot(GetCatalogSnapshot(relid));
13281333

13291334
returnheap_beginscan_internal(relation,snapshot,nkeys,key,
1330-
true, true, false, true);
1335+
true, true,true, false,false, true);
13311336
}
13321337

13331338
HeapScanDesc
@@ -1336,22 +1341,33 @@ heap_beginscan_strat(Relation relation, Snapshot snapshot,
13361341
boolallow_strat,boolallow_sync)
13371342
{
13381343
returnheap_beginscan_internal(relation,snapshot,nkeys,key,
1339-
allow_strat,allow_sync, false, false);
1344+
allow_strat,allow_sync, true,
1345+
false, false, false);
13401346
}
13411347

13421348
HeapScanDesc
13431349
heap_beginscan_bm(Relationrelation,Snapshotsnapshot,
13441350
intnkeys,ScanKeykey)
13451351
{
13461352
returnheap_beginscan_internal(relation,snapshot,nkeys,key,
1347-
false, false, true, false);
1353+
false, false, true, true, false, false);
1354+
}
1355+
1356+
HeapScanDesc
1357+
heap_beginscan_sampling(Relationrelation,Snapshotsnapshot,
1358+
intnkeys,ScanKeykey,
1359+
boolallow_strat,boolallow_pagemode)
1360+
{
1361+
returnheap_beginscan_internal(relation,snapshot,nkeys,key,
1362+
allow_strat, false,allow_pagemode,
1363+
false, true, false);
13481364
}
13491365

13501366
staticHeapScanDesc
13511367
heap_beginscan_internal(Relationrelation,Snapshotsnapshot,
13521368
intnkeys,ScanKeykey,
1353-
boolallow_strat,boolallow_sync,
1354-
boolis_bitmapscan,booltemp_snap)
1369+
boolallow_strat,boolallow_sync,boolallow_pagemode,
1370+
boolis_bitmapscan,boolis_samplescan,booltemp_snap)
13551371
{
13561372
HeapScanDescscan;
13571373

@@ -1373,6 +1389,7 @@ heap_beginscan_internal(Relation relation, Snapshot snapshot,
13731389
scan->rs_snapshot=snapshot;
13741390
scan->rs_nkeys=nkeys;
13751391
scan->rs_bitmapscan=is_bitmapscan;
1392+
scan->rs_samplescan=is_samplescan;
13761393
scan->rs_strategy=NULL;/* set in initscan */
13771394
scan->rs_allow_strat=allow_strat;
13781395
scan->rs_allow_sync=allow_sync;
@@ -1381,7 +1398,7 @@ heap_beginscan_internal(Relation relation, Snapshot snapshot,
13811398
/*
13821399
* we can use page-at-a-time mode if it's an MVCC-safe snapshot
13831400
*/
1384-
scan->rs_pageatatime=IsMVCCSnapshot(snapshot);
1401+
scan->rs_pageatatime=allow_pagemode&&IsMVCCSnapshot(snapshot);
13851402

13861403
/*
13871404
* For a seqscan in a serializable transaction, acquire a predicate lock
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#-------------------------------------------------------------------------
2+
#
3+
# Makefile--
4+
# Makefile for utils/tablesample
5+
#
6+
# IDENTIFICATION
7+
# src/backend/utils/tablesample/Makefile
8+
#
9+
#-------------------------------------------------------------------------
10+
11+
subdir = src/backend/access/tablesample
12+
top_builddir = ../../../..
13+
include$(top_builddir)/src/Makefile.global
14+
15+
OBJS = tablesample.o system.o bernoulli.o
16+
17+
include$(top_srcdir)/src/backend/common.mk

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp