Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit83e176e

Browse files
Separate block sampling functions
Refactoring ahead of tablesample patchRequested and reviewed by Michael PaquierPetr Jelinek
1 parent5a3022f commit83e176e

File tree

7 files changed

+287
-232
lines changed

7 files changed

+287
-232
lines changed

‎contrib/file_fdw/file_fdw.c

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
#include"optimizer/var.h"
3535
#include"utils/memutils.h"
3636
#include"utils/rel.h"
37+
#include"utils/sampling.h"
3738

3839
PG_MODULE_MAGIC;
3940

@@ -1006,7 +1007,7 @@ file_acquire_sample_rows(Relation onerel, int elevel,
10061007
{
10071008
intnumrows=0;
10081009
doublerowstoskip=-1;/* -1 means not set yet */
1009-
doublerstate;
1010+
ReservoirStateDatarstate;
10101011
TupleDesctupDesc;
10111012
Datum*values;
10121013
bool*nulls;
@@ -1044,7 +1045,7 @@ file_acquire_sample_rows(Relation onerel, int elevel,
10441045
ALLOCSET_DEFAULT_MAXSIZE);
10451046

10461047
/* Prepare for sampling rows */
1047-
rstate=anl_init_selection_state(targrows);
1048+
reservoir_init_selection_state(&rstate,targrows);
10481049

10491050
/* Set up callback to identify error line number. */
10501051
errcallback.callback=CopyFromErrorCallback;
@@ -1088,15 +1089,15 @@ file_acquire_sample_rows(Relation onerel, int elevel,
10881089
* not-yet-incremented value of totalrows as t.
10891090
*/
10901091
if (rowstoskip<0)
1091-
rowstoskip=anl_get_next_S(*totalrows,targrows,&rstate);
1092+
rowstoskip=reservoir_get_next_S(&rstate,*totalrows,targrows);
10921093

10931094
if (rowstoskip <=0)
10941095
{
10951096
/*
10961097
* Found a suitable tuple, so save it, replacing one old tuple
10971098
* at random
10981099
*/
1099-
intk= (int) (targrows*anl_random_fract());
1100+
intk= (int) (targrows*sampler_random_fract());
11001101

11011102
Assert(k >=0&&k<targrows);
11021103
heap_freetuple(rows[k]);

‎contrib/postgres_fdw/postgres_fdw.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
#include"utils/lsyscache.h"
3838
#include"utils/memutils.h"
3939
#include"utils/rel.h"
40+
#include"utils/sampling.h"
4041

4142
PG_MODULE_MAGIC;
4243

@@ -202,7 +203,7 @@ typedef struct PgFdwAnalyzeState
202203
/* for random sampling */
203204
doublesamplerows;/* # of rows fetched */
204205
doublerowstoskip;/* # of rows to skip before next sample */
205-
doublerstate;/*randomstate */
206+
ReservoirStateDatarstate;/* statefor reservoir sampling*/
206207

207208
/* working memory contexts */
208209
MemoryContextanl_cxt;/* context for per-analyze lifespan data */
@@ -2411,7 +2412,7 @@ postgresAcquireSampleRowsFunc(Relation relation, int elevel,
24112412
astate.numrows=0;
24122413
astate.samplerows=0;
24132414
astate.rowstoskip=-1;/* -1 means not set yet */
2414-
astate.rstate=anl_init_selection_state(targrows);
2415+
reservoir_init_selection_state(&astate.rstate,targrows);
24152416

24162417
/* Remember ANALYZE context, and create a per-tuple temp context */
24172418
astate.anl_cxt=CurrentMemoryContext;
@@ -2551,13 +2552,12 @@ analyze_row_processor(PGresult *res, int row, PgFdwAnalyzeState *astate)
25512552
* analyze.c; see Jeff Vitter's paper.
25522553
*/
25532554
if (astate->rowstoskip<0)
2554-
astate->rowstoskip=anl_get_next_S(astate->samplerows,targrows,
2555-
&astate->rstate);
2555+
astate->rowstoskip=reservoir_get_next_S(&astate->rstate,astate->samplerows,targrows);
25562556

25572557
if (astate->rowstoskip <=0)
25582558
{
25592559
/* Choose a random reservoir element to replace. */
2560-
pos= (int) (targrows*anl_random_fract());
2560+
pos= (int) (targrows*sampler_random_fract());
25612561
Assert(pos >=0&&pos<targrows);
25622562
heap_freetuple(astate->rows[pos]);
25632563
}

‎src/backend/commands/analyze.c

Lines changed: 6 additions & 219 deletions
Original file line numberDiff line numberDiff line change
@@ -50,23 +50,13 @@
5050
#include"utils/lsyscache.h"
5151
#include"utils/memutils.h"
5252
#include"utils/pg_rusage.h"
53+
#include"utils/sampling.h"
5354
#include"utils/sortsupport.h"
5455
#include"utils/syscache.h"
5556
#include"utils/timestamp.h"
5657
#include"utils/tqual.h"
5758

5859

59-
/* Data structure for Algorithm S from Knuth 3.4.2 */
60-
typedefstruct
61-
{
62-
BlockNumberN;/* number of blocks, known in advance */
63-
intn;/* desired sample size */
64-
BlockNumbert;/* current block number */
65-
intm;/* blocks selected so far */
66-
}BlockSamplerData;
67-
68-
typedefBlockSamplerData*BlockSampler;
69-
7060
/* Per-index data for ANALYZE */
7161
typedefstructAnlIndexData
7262
{
@@ -89,10 +79,6 @@ static void do_analyze_rel(Relation onerel, int options,
8979
VacuumParams*params,List*va_cols,
9080
AcquireSampleRowsFuncacquirefunc,BlockNumberrelpages,
9181
boolinh,boolin_outer_xact,intelevel);
92-
staticvoidBlockSampler_Init(BlockSamplerbs,BlockNumbernblocks,
93-
intsamplesize);
94-
staticboolBlockSampler_HasMore(BlockSamplerbs);
95-
staticBlockNumberBlockSampler_Next(BlockSamplerbs);
9682
staticvoidcompute_index_stats(Relationonerel,doubletotalrows,
9783
AnlIndexData*indexdata,intnindexes,
9884
HeapTuple*rows,intnumrows,
@@ -950,94 +936,6 @@ examine_attribute(Relation onerel, int attnum, Node *index_expr)
950936
returnstats;
951937
}
952938

953-
/*
954-
* BlockSampler_Init -- prepare for random sampling of blocknumbers
955-
*
956-
* BlockSampler is used for stage one of our new two-stage tuple
957-
* sampling mechanism as discussed on pgsql-hackers 2004-04-02 (subject
958-
* "Large DB"). It selects a random sample of samplesize blocks out of
959-
* the nblocks blocks in the table. If the table has less than
960-
* samplesize blocks, all blocks are selected.
961-
*
962-
* Since we know the total number of blocks in advance, we can use the
963-
* straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
964-
* algorithm.
965-
*/
966-
staticvoid
967-
BlockSampler_Init(BlockSamplerbs,BlockNumbernblocks,intsamplesize)
968-
{
969-
bs->N=nblocks;/* measured table size */
970-
971-
/*
972-
* If we decide to reduce samplesize for tables that have less or not much
973-
* more than samplesize blocks, here is the place to do it.
974-
*/
975-
bs->n=samplesize;
976-
bs->t=0;/* blocks scanned so far */
977-
bs->m=0;/* blocks selected so far */
978-
}
979-
980-
staticbool
981-
BlockSampler_HasMore(BlockSamplerbs)
982-
{
983-
return (bs->t<bs->N)&& (bs->m<bs->n);
984-
}
985-
986-
staticBlockNumber
987-
BlockSampler_Next(BlockSamplerbs)
988-
{
989-
BlockNumberK=bs->N-bs->t;/* remaining blocks */
990-
intk=bs->n-bs->m;/* blocks still to sample */
991-
doublep;/* probability to skip block */
992-
doubleV;/* random */
993-
994-
Assert(BlockSampler_HasMore(bs));/* hence K > 0 and k > 0 */
995-
996-
if ((BlockNumber)k >=K)
997-
{
998-
/* need all the rest */
999-
bs->m++;
1000-
returnbs->t++;
1001-
}
1002-
1003-
/*----------
1004-
* It is not obvious that this code matches Knuth's Algorithm S.
1005-
* Knuth says to skip the current block with probability 1 - k/K.
1006-
* If we are to skip, we should advance t (hence decrease K), and
1007-
* repeat the same probabilistic test for the next block. The naive
1008-
* implementation thus requires an anl_random_fract() call for each block
1009-
* number. But we can reduce this to one anl_random_fract() call per
1010-
* selected block, by noting that each time the while-test succeeds,
1011-
* we can reinterpret V as a uniform random number in the range 0 to p.
1012-
* Therefore, instead of choosing a new V, we just adjust p to be
1013-
* the appropriate fraction of its former value, and our next loop
1014-
* makes the appropriate probabilistic test.
1015-
*
1016-
* We have initially K > k > 0. If the loop reduces K to equal k,
1017-
* the next while-test must fail since p will become exactly zero
1018-
* (we assume there will not be roundoff error in the division).
1019-
* (Note: Knuth suggests a "<=" loop condition, but we use "<" just
1020-
* to be doubly sure about roundoff error.) Therefore K cannot become
1021-
* less than k, which means that we cannot fail to select enough blocks.
1022-
*----------
1023-
*/
1024-
V=anl_random_fract();
1025-
p=1.0- (double)k / (double)K;
1026-
while (V<p)
1027-
{
1028-
/* skip */
1029-
bs->t++;
1030-
K--;/* keep K == N - t */
1031-
1032-
/* adjust p to be new cutoff point in reduced range */
1033-
p *=1.0- (double)k / (double)K;
1034-
}
1035-
1036-
/* select */
1037-
bs->m++;
1038-
returnbs->t++;
1039-
}
1040-
1041939
/*
1042940
* acquire_sample_rows -- acquire a random sample of rows from the table
1043941
*
@@ -1084,7 +982,7 @@ acquire_sample_rows(Relation onerel, int elevel,
1084982
BlockNumbertotalblocks;
1085983
TransactionIdOldestXmin;
1086984
BlockSamplerDatabs;
1087-
doublerstate;
985+
ReservoirStateDatarstate;
1088986

1089987
Assert(targrows>0);
1090988

@@ -1094,9 +992,9 @@ acquire_sample_rows(Relation onerel, int elevel,
1094992
OldestXmin=GetOldestXmin(onerel, true);
1095993

1096994
/* Prepare for sampling block numbers */
1097-
BlockSampler_Init(&bs,totalblocks,targrows);
995+
BlockSampler_Init(&bs,totalblocks,targrows,random());
1098996
/* Prepare for sampling rows */
1099-
rstate=anl_init_selection_state(targrows);
997+
reservoir_init_selection_state(&rstate,targrows);
1100998

1101999
/* Outer loop over blocks to sample */
11021000
while (BlockSampler_HasMore(&bs))
@@ -1244,16 +1142,15 @@ acquire_sample_rows(Relation onerel, int elevel,
12441142
* t.
12451143
*/
12461144
if (rowstoskip<0)
1247-
rowstoskip=anl_get_next_S(samplerows,targrows,
1248-
&rstate);
1145+
rowstoskip=reservoir_get_next_S(&rstate,samplerows,targrows);
12491146

12501147
if (rowstoskip <=0)
12511148
{
12521149
/*
12531150
* Found a suitable tuple, so save it, replacing one
12541151
* old tuple at random
12551152
*/
1256-
intk= (int) (targrows*anl_random_fract());
1153+
intk= (int) (targrows*sampler_random_fract());
12571154

12581155
Assert(k >=0&&k<targrows);
12591156
heap_freetuple(rows[k]);
@@ -1312,116 +1209,6 @@ acquire_sample_rows(Relation onerel, int elevel,
13121209
returnnumrows;
13131210
}
13141211

1315-
/* Select a random value R uniformly distributed in (0 - 1) */
1316-
double
1317-
anl_random_fract(void)
1318-
{
1319-
return ((double)random()+1) / ((double)MAX_RANDOM_VALUE+2);
1320-
}
1321-
1322-
/*
1323-
* These two routines embody Algorithm Z from "Random sampling with a
1324-
* reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
1325-
* (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms
1326-
* of the count S of records to skip before processing another record.
1327-
* It is computed primarily based on t, the number of records already read.
1328-
* The only extra state needed between calls is W, a random state variable.
1329-
*
1330-
* anl_init_selection_state computes the initial W value.
1331-
*
1332-
* Given that we've already read t records (t >= n), anl_get_next_S
1333-
* determines the number of records to skip before the next record is
1334-
* processed.
1335-
*/
1336-
double
1337-
anl_init_selection_state(intn)
1338-
{
1339-
/* Initial value of W (for use when Algorithm Z is first applied) */
1340-
returnexp(-log(anl_random_fract()) /n);
1341-
}
1342-
1343-
double
1344-
anl_get_next_S(doublet,intn,double*stateptr)
1345-
{
1346-
doubleS;
1347-
1348-
/* The magic constant here is T from Vitter's paper */
1349-
if (t <= (22.0*n))
1350-
{
1351-
/* Process records using Algorithm X until t is large enough */
1352-
doubleV,
1353-
quot;
1354-
1355-
V=anl_random_fract();/* Generate V */
1356-
S=0;
1357-
t+=1;
1358-
/* Note: "num" in Vitter's code is always equal to t - n */
1359-
quot= (t- (double)n) /t;
1360-
/* Find min S satisfying (4.1) */
1361-
while (quot>V)
1362-
{
1363-
S+=1;
1364-
t+=1;
1365-
quot *= (t- (double)n) /t;
1366-
}
1367-
}
1368-
else
1369-
{
1370-
/* Now apply Algorithm Z */
1371-
doubleW=*stateptr;
1372-
doubleterm=t- (double)n+1;
1373-
1374-
for (;;)
1375-
{
1376-
doublenumer,
1377-
numer_lim,
1378-
denom;
1379-
doubleU,
1380-
X,
1381-
lhs,
1382-
rhs,
1383-
y,
1384-
tmp;
1385-
1386-
/* Generate U and X */
1387-
U=anl_random_fract();
1388-
X=t* (W-1.0);
1389-
S=floor(X);/* S is tentatively set to floor(X) */
1390-
/* Test if U <= h(S)/cg(X) in the manner of (6.3) */
1391-
tmp= (t+1) /term;
1392-
lhs=exp(log(((U*tmp*tmp)* (term+S)) / (t+X)) /n);
1393-
rhs= (((t+X) / (term+S))*term) /t;
1394-
if (lhs <=rhs)
1395-
{
1396-
W=rhs /lhs;
1397-
break;
1398-
}
1399-
/* Test if U <= f(S)/cg(X) */
1400-
y= (((U* (t+1)) /term)* (t+S+1)) / (t+X);
1401-
if ((double)n<S)
1402-
{
1403-
denom=t;
1404-
numer_lim=term+S;
1405-
}
1406-
else
1407-
{
1408-
denom=t- (double)n+S;
1409-
numer_lim=t+1;
1410-
}
1411-
for (numer=t+S;numer >=numer_lim;numer-=1)
1412-
{
1413-
y *=numer /denom;
1414-
denom-=1;
1415-
}
1416-
W=exp(-log(anl_random_fract()) /n);/* Generate W in advance */
1417-
if (exp(log(y) /n) <= (t+X) /t)
1418-
break;
1419-
}
1420-
*stateptr=W;
1421-
}
1422-
returnS;
1423-
}
1424-
14251212
/*
14261213
* qsort comparator for sorting rows[] array
14271214
*/

‎src/backend/utils/misc/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ include $(top_builddir)/src/Makefile.global
1515
overrideCPPFLAGS := -I. -I$(srcdir)$(CPPFLAGS)
1616

1717
OBJS = guc.o help_config.o pg_rusage.o ps_status.o rls.o\
18-
superuser.o timeout.o tzparser.o
18+
sampling.osuperuser.o timeout.o tzparser.o
1919

2020
# This location might depend on the installation directories. Therefore
2121
# we can't subsitute it into pg_config.h.

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp