77 * Copyright (c) 2001-2006, PostgreSQL Global Development Group
88 *
99 * IDENTIFICATION
10- * $PostgreSQL: pgsql/src/backend/executor/instrument.c,v 1.15 2006/05/30 14:01:58 momjian Exp $
10+ * $PostgreSQL: pgsql/src/backend/executor/instrument.c,v 1.16 2006/05/30 19:24:25 tgl Exp $
1111 *
1212 *-------------------------------------------------------------------------
1313 */
1414#include "postgres.h"
1515
16- #include <unistd.h>
1716#include <math.h>
17+ #include <unistd.h>
1818
1919#include "executor/instrument.h"
2020
21- /* This is the function that is used to determine the sampling intervals. In
22- * general, if the function is f(x), then for N tuples we will take on the
23- * order of integral(1/f(x), x=0..N) samples. Some examples follow, with the
24- * number of samples that would be collected over 1,000,000 tuples.
2521
26- f(x) = x => log2(N) 20
27- f(x) = x^(1/2) => 2 * N^(1/2) 2000
28- f(x) = x^(1/3) => 1.5 * N^(2/3) 15000
29-
22+ /*
23+ * As of PostgreSQL 8.2, we try to reduce the overhead of EXPLAIN ANALYZE
24+ * by not calling INSTR_TIME_SET_CURRENT() for every single node execution.
25+ * (Because that requires a kernel call on most systems, it's expensive.)
26+ *
27+ * This macro determines the sampling interval: that is, after how many more
28+ * iterations will we take the next time sample, given that niters iterations
29+ * have occurred already. In general, if the function is f(x), then for N
30+ * iterations we will take on the order of integral(1/f(x), x=0..N)
31+ * samples. Some examples follow, with the number of samples that would be
32+ * collected over 1,000,000 iterations:
33+ *
34+ *f(x) = x => log2(N) 20
35+ *f(x) = x^(1/2) => 2 * N^(1/2) 2000
36+ *f(x) = x^(1/3) => 1.5 * N^(2/3) 15000
37+ *
3038 * I've chosen the last one as it seems to provide a good compromise between
3139 * low overhead but still getting a meaningful number of samples. However,
3240 * not all machines have the cbrt() function so on those we substitute
3341 * sqrt(). The difference is not very significant in the tests I made.
34- */
42+ *
43+ * The actual sampling interval is randomized with the SampleFunc() value
44+ * as the mean; this hopefully will reduce any measurement bias due to
45+ * cyclic variation in the node execution time.
46+ */
3547#ifdef HAVE_CBRT
36- #define SampleFunc cbrt
48+ #define SampleFunc(niters) cbrt(niters)
3749#else
38- #define SampleFunc sqrt
50+ #define SampleFunc(niters) sqrt(niters)
3951#endif
4052
53+ #define SampleInterval(niters) \
54+ (SampleFunc(niters) * (double) random() / (double) (MAX_RANDOM_VALUE/2))
55+
56+ /*
57+ * We sample at every node iteration until we've reached this threshold,
58+ * so that nodes not called a large number of times are completely accurate.
59+ * (Perhaps this should be a GUC variable? But beware of messing up
60+ * CalculateSampleOverhead if value is too small.)
61+ */
4162#define SAMPLE_THRESHOLD 50
4263
64+ /*
65+ * Determine the sampling overhead. This only needs to be done once per
66+ * backend (actually, probably once per postmaster would do ...)
67+ */
4368static double SampleOverhead ;
44- static bool SampleOverheadCalculated ;
69+ static bool SampleOverheadCalculated = false ;
4570
4671static void
47- CalculateSampleOverhead ()
72+ CalculateSampleOverhead (void )
4873{
4974Instrumentation instr ;
5075int i ;
51-
52- /* We want to determine the sampling overhead, to correct
53- * calculations later. This only needs to be done once per backend.
54- * Is this the place? A wrong value here (due to a mistimed
55- * task-switch) will cause bad calculations later.
56- *
76+
77+ /*
78+ * We could get a wrong result due to being interrupted by task switch.
5779 * To minimize the risk we do it a few times and take the lowest.
5880 */
59-
6081SampleOverhead = 1.0e6 ;
61-
62- for ( i = 0 ;i < 5 ;i ++ )
82+
83+ for ( i = 0 ;i < 5 ;i ++ )
6384{
6485int j ;
6586double overhead ;
66-
67- memset (& instr ,0 ,sizeof (instr ) );
68-
69- /* Loop SAMPLE_THRESHOLD times or 100 microseconds, whichever is faster */
70- for (j = 0 ;j < SAMPLE_THRESHOLD && INSTR_TIME_GET_DOUBLE (instr .counter )< 100e-6 ;i ++ )
87+
88+ memset (& instr ,0 ,sizeof (instr ));
89+ /*
90+ * We know that samples will actually be taken up to SAMPLE_THRESHOLD,
91+ * so that's as far as we can test.
92+ */
93+ for (j = 0 ;j < SAMPLE_THRESHOLD ;j ++ )
7194{
72- InstrStartNode (& instr );
73- InstrStopNode (& instr ,1 );
95+ InstrStartNode (& instr );
96+ InstrStopNode (& instr ,1 );
7497}
7598overhead = INSTR_TIME_GET_DOUBLE (instr .counter ) /instr .samplecount ;
76- if ( overhead < SampleOverhead )
99+ if ( overhead < SampleOverhead )
77100SampleOverhead = overhead ;
78101}
79-
102+
80103SampleOverheadCalculated = true;
81104}
82105
106+
83107/* Allocate new instrumentation structure(s) */
84108Instrumentation *
85109InstrAlloc (int n )
86110{
87- Instrumentation * instr = palloc0 ( n * sizeof ( Instrumentation )) ;
111+ Instrumentation * instr ;
88112
89- /* we don't need to do any initialization except zero 'em */
90-
91- /* Calculate overhead, if not done yet */
92- if ( !SampleOverheadCalculated )
113+ /* Calculate sampling overhead, if not done yet in this backend */
114+ if (!SampleOverheadCalculated )
93115CalculateSampleOverhead ();
116+
117+ instr = palloc0 (n * sizeof (Instrumentation ));
118+
119+ /* we don't need to do any initialization except zero 'em */
120+
94121return instr ;
95122}
96123
@@ -100,18 +127,17 @@ InstrStartNode(Instrumentation *instr)
100127{
101128if (INSTR_TIME_IS_ZERO (instr -> starttime ))
102129{
103- /* We always sample the first SAMPLE_THRESHOLD tuples, so small nodes are always accurate */
104- if (instr -> tuplecount < SAMPLE_THRESHOLD )
130+ /*
131+ * Always sample if not yet up to threshold, else check whether
132+ * next threshold has been reached
133+ */
134+ if (instr -> itercount < SAMPLE_THRESHOLD )
105135instr -> sampling = true;
106- else
136+ else if ( instr -> itercount >= instr -> nextsample )
107137{
108- /* Otherwise we go to sampling, see the comments on SampleFunc at the top of the file */
109- if (instr -> tuplecount > instr -> nextsample )
110- {
111- instr -> sampling = true;
112- /* The doubling is so the random will average 1 over time */
113- instr -> nextsample += 2.0 * SampleFunc (instr -> tuplecount )* (double )rand () / (double )RAND_MAX ;
114- }
138+ instr -> sampling = true;
139+ instr -> nextsample =
140+ instr -> itercount + SampleInterval (instr -> itercount );
115141}
116142if (instr -> sampling )
117143INSTR_TIME_SET_CURRENT (instr -> starttime );
@@ -124,13 +150,15 @@ InstrStartNode(Instrumentation *instr)
124150void
125151InstrStopNode (Instrumentation * instr ,double nTuples )
126152{
127- instr_time endtime ;
128-
129- /* count the returned tuples */
153+ /* count the returned tuples and iterations */
130154instr -> tuplecount += nTuples ;
155+ instr -> itercount += 1 ;
131156
157+ /* measure runtime if appropriate */
132158if (instr -> sampling )
133159{
160+ instr_time endtime ;
161+
134162if (INSTR_TIME_IS_ZERO (instr -> starttime ))
135163{
136164elog (DEBUG2 ,"InstrStopNode called without start" );
@@ -159,7 +187,8 @@ InstrStopNode(Instrumentation *instr, double nTuples)
159187#endif
160188
161189INSTR_TIME_SET_ZERO (instr -> starttime );
162- instr -> samplecount += nTuples ;
190+
191+ instr -> samplecount += 1 ;
163192instr -> sampling = false;
164193}
165194
@@ -184,35 +213,44 @@ InstrEndLoop(Instrumentation *instr)
184213if (!INSTR_TIME_IS_ZERO (instr -> starttime ))
185214elog (DEBUG2 ,"InstrEndLoop called on running node" );
186215
187- /*Accumulate per-cycle statistics into totals */
216+ /*Compute time spent in node */
188217totaltime = INSTR_TIME_GET_DOUBLE (instr -> counter );
189218
190- instr -> startup += instr -> firsttuple ;
191-
192- /* Here we take into account sampling effects. Doing it naively ends
193- * up assuming the sampling overhead applies to all tuples, even the
194- * ones we didn't measure. We've calculated an overhead, so we
195- * subtract that for all samples we didn't measure. The first tuple
196- * is also special cased, because it usually takes longer. */
197-
198- if (instr -> samplecount < instr -> tuplecount )
219+ /*
220+ * If we didn't measure runtime on every iteration, then we must increase
221+ * the measured total to account for the other iterations. Naively
222+ * multiplying totaltime by itercount/samplecount would be wrong because
223+ * it effectively assumes the sampling overhead applies to all iterations,
224+ * even the ones we didn't measure. (Note that what we are trying to
225+ * estimate here is the actual time spent in the node, including the
226+ * actual measurement overhead; not the time exclusive of measurement
227+ * overhead.) We exclude the first iteration from the correction basis,
228+ * because it often takes longer than others.
229+ */
230+ if (instr -> itercount > instr -> samplecount )
199231{
200- double pertuple = (totaltime - instr -> firsttuple ) / (instr -> samplecount - 1 );
201- instr -> total += instr -> firsttuple + (pertuple * (instr -> samplecount - 1 ))
202- + ((pertuple - SampleOverhead )* (instr -> tuplecount - instr -> samplecount ));
232+ double per_iter ;
233+
234+ per_iter = (totaltime - instr -> firsttuple ) / (instr -> samplecount - 1 )
235+ - SampleOverhead ;
236+ if (per_iter > 0 )/* sanity check */
237+ totaltime += per_iter * (instr -> itercount - instr -> samplecount );
203238}
204- else
205- instr -> total += totaltime ;
206-
239+
240+ /* Accumulate per-cycle statistics into totals */
241+ instr -> startup += instr -> firsttuple ;
242+ instr -> total += totaltime ;
207243instr -> ntuples += instr -> tuplecount ;
208- instr -> nsamples += instr -> samplecount ;
209244instr -> nloops += 1 ;
210245
211246/* Reset for next cycle (if any) */
212247instr -> running = false;
248+ instr -> sampling = false;
213249INSTR_TIME_SET_ZERO (instr -> starttime );
214250INSTR_TIME_SET_ZERO (instr -> counter );
215251instr -> firsttuple = 0 ;
216- instr -> samplecount = 0 ;
217252instr -> tuplecount = 0 ;
253+ instr -> itercount = 0 ;
254+ instr -> samplecount = 0 ;
255+ instr -> nextsample = 0 ;
218256}