77 * Copyright (c) 2001-2006, PostgreSQL Global Development Group
88 *
99 * IDENTIFICATION
10- * $PostgreSQL: pgsql/src/backend/executor/instrument.c,v 1.14 2006/03/05 15:58:26 momjian Exp $
10+ * $PostgreSQL: pgsql/src/backend/executor/instrument.c,v 1.15 2006/05/30 14:01:58 momjian Exp $
1111 *
1212 *-------------------------------------------------------------------------
1313 */
1414#include "postgres.h"
1515
1616#include <unistd.h>
17+ #include <math.h>
1718
1819#include "executor/instrument.h"
1920
21+ /* This is the function that is used to determine the sampling intervals. In
22+ * general, if the function is f(x), then for N tuples we will take on the
23+ * order of integral(1/f(x), x=0..N) samples. Some examples follow, with the
24+ * number of samples that would be collected over 1,000,000 tuples.
25+
26+ f(x) = x => log2(N) 20
27+ f(x) = x^(1/2) => 2 * N^(1/2) 2000
28+ f(x) = x^(1/3) => 1.5 * N^(2/3) 15000
29+
30+ * I've chosen the last one as it seems to provide a good compromise between
31+ * low overhead but still getting a meaningful number of samples. However,
32+ * not all machines have the cbrt() function so on those we substitute
33+ * sqrt(). The difference is not very significant in the tests I made.
34+ */
35+ #ifdef HAVE_CBRT
36+ #define SampleFunc cbrt
37+ #else
38+ #define SampleFunc sqrt
39+ #endif
40+
41+ #define SAMPLE_THRESHOLD 50
42+
43+ static double SampleOverhead ;
44+ static bool SampleOverheadCalculated ;
45+
46+ static void
47+ CalculateSampleOverhead ()
48+ {
49+ Instrumentation instr ;
50+ int i ;
51+
52+ /* We want to determine the sampling overhead, to correct
53+ * calculations later. This only needs to be done once per backend.
54+ * Is this the place? A wrong value here (due to a mistimed
55+ * task-switch) will cause bad calculations later.
56+ *
57+ * To minimize the risk we do it a few times and take the lowest.
58+ */
59+
60+ SampleOverhead = 1.0e6 ;
61+
62+ for (i = 0 ;i < 5 ;i ++ )
63+ {
64+ int j ;
65+ double overhead ;
66+
67+ memset (& instr ,0 ,sizeof (instr ) );
68+
69+ /* Loop SAMPLE_THRESHOLD times or 100 microseconds, whichever is faster */
70+ for (j = 0 ;j < SAMPLE_THRESHOLD && INSTR_TIME_GET_DOUBLE (instr .counter )< 100e-6 ;i ++ )
71+ {
72+ InstrStartNode (& instr );
73+ InstrStopNode (& instr ,1 );
74+ }
75+ overhead = INSTR_TIME_GET_DOUBLE (instr .counter ) /instr .samplecount ;
76+ if (overhead < SampleOverhead )
77+ SampleOverhead = overhead ;
78+ }
79+
80+ SampleOverheadCalculated = true;
81+ }
2082
2183/* Allocate new instrumentation structure(s) */
2284Instrumentation *
@@ -25,7 +87,10 @@ InstrAlloc(int n)
2587Instrumentation * instr = palloc0 (n * sizeof (Instrumentation ));
2688
2789/* we don't need to do any initialization except zero 'em */
28-
90+
91+ /* Calculate overhead, if not done yet */
92+ if ( !SampleOverheadCalculated )
93+ CalculateSampleOverhead ();
2994return instr ;
3095}
3196
3499InstrStartNode (Instrumentation * instr )
35100{
36101if (INSTR_TIME_IS_ZERO (instr -> starttime ))
37- INSTR_TIME_SET_CURRENT (instr -> starttime );
102+ {
103+ /* We always sample the first SAMPLE_THRESHOLD tuples, so small nodes are always accurate */
104+ if (instr -> tuplecount < SAMPLE_THRESHOLD )
105+ instr -> sampling = true;
106+ else
107+ {
108+ /* Otherwise we go to sampling, see the comments on SampleFunc at the top of the file */
109+ if (instr -> tuplecount > instr -> nextsample )
110+ {
111+ instr -> sampling = true;
112+ /* The doubling is so the random will average 1 over time */
113+ instr -> nextsample += 2.0 * SampleFunc (instr -> tuplecount )* (double )rand () / (double )RAND_MAX ;
114+ }
115+ }
116+ if (instr -> sampling )
117+ INSTR_TIME_SET_CURRENT (instr -> starttime );
118+ }
38119else
39120elog (DEBUG2 ,"InstrStartNode called twice in a row" );
40121}
41122
42123/* Exit from a plan node */
43124void
44- InstrStopNode (Instrumentation * instr ,bool returnedTuple )
125+ InstrStopNode (Instrumentation * instr ,double nTuples )
45126{
46127instr_time endtime ;
47128
48129/* count the returned tuples */
49- if (returnedTuple )
50- instr -> tuplecount += 1 ;
130+ instr -> tuplecount += nTuples ;
51131
52- if (INSTR_TIME_IS_ZERO ( instr -> starttime ) )
132+ if (instr -> sampling )
53133{
54- elog (DEBUG2 ,"InstrStopNode called without start" );
55- return ;
56- }
134+ if (INSTR_TIME_IS_ZERO (instr -> starttime ))
135+ {
136+ elog (DEBUG2 ,"InstrStopNode called without start" );
137+ return ;
138+ }
57139
58- INSTR_TIME_SET_CURRENT (endtime );
140+ INSTR_TIME_SET_CURRENT (endtime );
59141
60142#ifndef WIN32
61- instr -> counter .tv_sec += endtime .tv_sec - instr -> starttime .tv_sec ;
62- instr -> counter .tv_usec += endtime .tv_usec - instr -> starttime .tv_usec ;
63-
64- /* Normalize after each add to avoid overflow/underflow of tv_usec */
65- while (instr -> counter .tv_usec < 0 )
66- {
67- instr -> counter .tv_usec += 1000000 ;
68- instr -> counter .tv_sec -- ;
69- }
70- while (instr -> counter .tv_usec >=1000000 )
71- {
72- instr -> counter .tv_usec -= 1000000 ;
73- instr -> counter .tv_sec ++ ;
74- }
143+ instr -> counter .tv_sec += endtime .tv_sec - instr -> starttime .tv_sec ;
144+ instr -> counter .tv_usec += endtime .tv_usec - instr -> starttime .tv_usec ;
145+
146+ /* Normalize after each add to avoid overflow/underflow of tv_usec */
147+ while (instr -> counter .tv_usec < 0 )
148+ {
149+ instr -> counter .tv_usec += 1000000 ;
150+ instr -> counter .tv_sec -- ;
151+ }
152+ while (instr -> counter .tv_usec >=1000000 )
153+ {
154+ instr -> counter .tv_usec -= 1000000 ;
155+ instr -> counter .tv_sec ++ ;
156+ }
75157#else /* WIN32 */
76- instr -> counter .QuadPart += (endtime .QuadPart - instr -> starttime .QuadPart );
158+ instr -> counter .QuadPart += (endtime .QuadPart - instr -> starttime .QuadPart );
77159#endif
78160
79- INSTR_TIME_SET_ZERO (instr -> starttime );
161+ INSTR_TIME_SET_ZERO (instr -> starttime );
162+ instr -> samplecount += nTuples ;
163+ instr -> sampling = false;
164+ }
80165
81166/* Is this the first tuple of this cycle? */
82167if (!instr -> running )
@@ -86,17 +171,6 @@ InstrStopNode(Instrumentation *instr, bool returnedTuple)
86171}
87172}
88173
89- /* As above, but count multiple tuples returned at once */
90- void
91- InstrStopNodeMulti (Instrumentation * instr ,double nTuples )
92- {
93- /* count the returned tuples */
94- instr -> tuplecount += nTuples ;
95-
96- /* delegate the rest */
97- InstrStopNode (instr , false);
98- }
99-
100174/* Finish a run cycle for a plan node */
101175void
102176InstrEndLoop (Instrumentation * instr )
@@ -114,14 +188,31 @@ InstrEndLoop(Instrumentation *instr)
114188totaltime = INSTR_TIME_GET_DOUBLE (instr -> counter );
115189
116190instr -> startup += instr -> firsttuple ;
117- instr -> total += totaltime ;
191+
192+ /* Here we take into account sampling effects. Doing it naively ends
193+ * up assuming the sampling overhead applies to all tuples, even the
194+ * ones we didn't measure. We've calculated an overhead, so we
195+ * subtract that for all samples we didn't measure. The first tuple
196+ * is also special cased, because it usually takes longer. */
197+
198+ if (instr -> samplecount < instr -> tuplecount )
199+ {
200+ double pertuple = (totaltime - instr -> firsttuple ) / (instr -> samplecount - 1 );
201+ instr -> total += instr -> firsttuple + (pertuple * (instr -> samplecount - 1 ))
202+ + ((pertuple - SampleOverhead )* (instr -> tuplecount - instr -> samplecount ));
203+ }
204+ else
205+ instr -> total += totaltime ;
206+
118207instr -> ntuples += instr -> tuplecount ;
208+ instr -> nsamples += instr -> samplecount ;
119209instr -> nloops += 1 ;
120210
121211/* Reset for next cycle (if any) */
122212instr -> running = false;
123213INSTR_TIME_SET_ZERO (instr -> starttime );
124214INSTR_TIME_SET_ZERO (instr -> counter );
125215instr -> firsttuple = 0 ;
216+ instr -> samplecount = 0 ;
126217instr -> tuplecount = 0 ;
127218}