  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/storage/lmgr/s_lock.c,v 1.38 2005/08/26 14:47:35 tgl Exp $
+ *    $PostgreSQL: pgsql/src/backend/storage/lmgr/s_lock.c,v 1.39 2005/10/11 20:41:32 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "storage/s_lock.h"
 #include "miscadmin.h"

+
+static int spins_per_delay = DEFAULT_SPINS_PER_DELAY;
+
+
 /*
  * s_lock_stuck() - complain about a stuck spinlock
  */
@@ -49,54 +53,67 @@ s_lock(volatile slock_t *lock, const char *file, int line)
  * We loop tightly for awhile, then delay using pg_usleep() and try
  * again. Preferably, "awhile" should be a small multiple of the
  * maximum time we expect a spinlock to be held. 100 iterations seems
- * about right. In most multi-CPU scenarios, the spinlock is probably
- * held by a process on another CPU and will be released before we
- * finish 100 iterations. However, on a uniprocessor, the tight loop
- * is just a waste of cycles, so don't iterate thousands of times.
+ * about right as an initial guess. However, on a uniprocessor the
+ * loop is a waste of cycles, while in a multi-CPU scenario it's usually
+ * better to spin a bit longer than to call the kernel, so we try to
+ * adapt the spin loop count depending on whether we seem to be in
+ * a uniprocessor or multiprocessor.
+ *
+ * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
+ * be wrong; there are platforms where that can result in a "stuck
+ * spinlock" failure. This has been seen particularly on Alphas; it
+ * seems that the first TAS after returning from kernel space will always
+ * fail on that hardware.
  *
  * Once we do decide to block, we use randomly increasing pg_usleep()
- * delays. The first delay is 10 msec, then the delay randomly
- * increases to about one second, after which we reset to 10 msec and
+ * delays. The first delay is 1 msec, then the delay randomly
+ * increases to about one second, after which we reset to 1 msec and
  * start again. The idea here is that in the presence of heavy
  * contention we need to increase the delay, else the spinlock holder
  * may never get to run and release the lock. (Consider situation
  * where spinlock holder has been nice'd down in priority by the
  * scheduler --- it will not get scheduled until all would-be
- * acquirers are sleeping, so if we always use a 10-msec sleep, there
+ * acquirers are sleeping, so if we always use a 1-msec sleep, there
  * is a real possibility of starvation.) But we can't just clamp the
  * delay to an upper bound, else it would take a long time to make a
  * reasonable number of tries.
  *
  * We time out and declare error after NUM_DELAYS delays (thus, exactly
  * that many tries). With the given settings, this will usually take
- * 3 or so minutes. It seems better to fix the total number of tries
+ * 2 or so minutes. It seems better to fix the total number of tries
  * (and thus the probability of unintended failure) than to fix the
  * total time spent.
  *
- * The pg_usleep() delays are measured in centiseconds (0.01 sec) because
- * 10 msec is a common resolution limit at the OS level.
+ * The pg_usleep() delays are measured in milliseconds because 1 msec
+ * is a common resolution limit at the OS level for newer platforms.
+ * On older platforms the resolution limit is usually 10 msec, in
+ * which case the total delay before timeout will be a bit more.
  */
-#define SPINS_PER_DELAY     100
+#define MIN_SPINS_PER_DELAY 10
+#define MAX_SPINS_PER_DELAY 1000
 #define NUM_DELAYS          1000
-#define MIN_DELAY_CSEC      1
-#define MAX_DELAY_CSEC      100
+#define MIN_DELAY_MSEC      1
+#define MAX_DELAY_MSEC      1000

     int spins = 0;
     int delays = 0;
-    int cur_delay = MIN_DELAY_CSEC;
+    int cur_delay = 0;

     while (TAS(lock))
     {
         /* CPU-specific delay each time through the loop */
         SPIN_DELAY();

-        /* Block the process every SPINS_PER_DELAY tries */
-        if (++spins > SPINS_PER_DELAY)
+        /* Block the process every spins_per_delay tries */
+        if (++spins >= spins_per_delay)
         {
             if (++delays > NUM_DELAYS)
                 s_lock_stuck(lock, file, line);

-            pg_usleep(cur_delay * 10000L);
+            if (cur_delay == 0)     /* first time to delay? */
+                cur_delay = MIN_DELAY_MSEC;
+
+            pg_usleep(cur_delay * 1000L);

 #if defined(S_LOCK_TEST)
             fprintf(stdout, "*");
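
As a rough check on the "2 or so minutes" figure in the comment above, the following standalone sketch (not part of this patch) simulates the delay schedule it describes: start at MIN_DELAY_MSEC, grow each delay by a random fraction between 1x and 2x, wrap back to the minimum once MAX_DELAY_MSEC is exceeded, and sum the requested sleeps over NUM_DELAYS delays. MAX_RANDOM_VALUE is assumed here to be 2^31 - 1; in PostgreSQL it comes from c.h.

/*
 * Standalone sketch: total sleep time implied by the randomized backoff
 * schedule described in the s_lock() comment.  Not part of s_lock.c.
 */
#include <stdio.h>
#include <stdlib.h>

#define MIN_DELAY_MSEC      1
#define MAX_DELAY_MSEC      1000
#define NUM_DELAYS          1000
#define MAX_RANDOM_VALUE    0x7FFFFFFF  /* assumption: 2^31 - 1, as in c.h */

int
main(void)
{
    long    total_msec = 0;
    int     cur_delay = 0;
    int     i;

    for (i = 0; i < NUM_DELAYS; i++)
    {
        if (cur_delay == 0)             /* first delay is the minimum */
            cur_delay = MIN_DELAY_MSEC;

        total_msec += cur_delay;        /* what pg_usleep() would be asked for */

        /* grow the delay by a random fraction between 1x and 2x */
        cur_delay += (int) (cur_delay *
                            ((double) random() / (double) MAX_RANDOM_VALUE) + 0.5);
        if (cur_delay > MAX_DELAY_MSEC)
            cur_delay = MIN_DELAY_MSEC; /* wrap back to the minimum */
    }

    /* typically prints a figure in the neighborhood of two minutes */
    printf("total sleep over %d delays: ~%ld seconds\n",
           NUM_DELAYS, total_msec / 1000);
    return 0;
}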
@@ -107,14 +124,76 @@ s_lock(volatile slock_t *lock, const char *file, int line)
             cur_delay += (int) (cur_delay *
                     (((double) random()) / ((double) MAX_RANDOM_VALUE)) + 0.5);
             /* wrap back to minimum delay when max is exceeded */
-            if (cur_delay > MAX_DELAY_CSEC)
-                cur_delay = MIN_DELAY_CSEC;
+            if (cur_delay > MAX_DELAY_MSEC)
+                cur_delay = MIN_DELAY_MSEC;

             spins = 0;
         }
     }
+
+    /*
+     * If we were able to acquire the lock without delaying, it's a good
+     * indication we are in a multiprocessor. If we had to delay, it's
+     * a sign (but not a sure thing) that we are in a uniprocessor.
+     * Hence, we decrement spins_per_delay slowly when we had to delay,
+     * and increase it rapidly when we didn't. It's expected that
+     * spins_per_delay will converge to the minimum value on a uniprocessor
+     * and to the maximum value on a multiprocessor.
+     *
+     * Note: spins_per_delay is local within our current process.
+     * We want to average these observations across multiple backends,
+     * since it's relatively rare for this function to even get entered,
+     * and so a single backend might not live long enough to converge on
+     * a good value. That is handled by the two routines below.
+     */
+    if (cur_delay == 0)
+    {
+        /* we never had to delay */
+        if (spins_per_delay < MAX_SPINS_PER_DELAY)
+            spins_per_delay = Min(spins_per_delay + 100, MAX_SPINS_PER_DELAY);
+    }
+    else
+    {
+        if (spins_per_delay > MIN_SPINS_PER_DELAY)
+            spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY);
+    }
+}
+
+
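To see why the rules above count as "increase rapidly, decrement slowly", here is a small standalone sketch of the same +100 / -1 adjustments (the starting value of 100 is an illustrative assumption, not taken from this patch): a handful of no-delay acquisitions saturates the estimate at MAX_SPINS_PER_DELAY, while decaying back to MIN_SPINS_PER_DELAY takes nearly a thousand delayed acquisitions.

/*
 * Standalone sketch of the asymmetric spins_per_delay adjustment.
 * Min()/Max() normally come from PostgreSQL's c.h; defined locally here.
 */
#include <stdio.h>

#define MIN_SPINS_PER_DELAY 10
#define MAX_SPINS_PER_DELAY 1000
#define Min(x, y) ((x) < (y) ? (x) : (y))
#define Max(x, y) ((x) > (y) ? (x) : (y))

int
main(void)
{
    int     spd = 100;              /* assumed starting estimate */
    int     steps;

    /* every acquisition succeeds without delaying: +100 per call */
    for (steps = 0; spd < MAX_SPINS_PER_DELAY; steps++)
        spd = Min(spd + 100, MAX_SPINS_PER_DELAY);
    printf("no-delay acquisitions to reach max: %d\n", steps);   /* 9 */

    /* every acquisition has to delay: -1 per call */
    for (steps = 0; spd > MIN_SPINS_PER_DELAY; steps++)
        spd = Max(spd - 1, MIN_SPINS_PER_DELAY);
    printf("delayed acquisitions to reach min:  %d\n", steps);   /* 990 */

    return 0;
}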
+/*
+ * Set local copy of spins_per_delay during backend startup.
+ *
+ * NB: this has to be pretty fast as it is called while holding a spinlock
+ */
+void
+set_spins_per_delay(int shared_spins_per_delay)
+{
+    spins_per_delay = shared_spins_per_delay;
+}
+
+/*
+ * Update shared estimate of spins_per_delay during backend exit.
+ *
+ * NB: this has to be pretty fast as it is called while holding a spinlock
+ */
+int
+update_spins_per_delay(int shared_spins_per_delay)
+{
+    /*
+     * We use an exponential moving average with a relatively slow
+     * adaption rate, so that noise in any one backend's result won't
+     * affect the shared value too much. As long as both inputs are
+     * within the allowed range, the result must be too, so we need not
+     * worry about clamping the result.
+     *
+     * We deliberately truncate rather than rounding; this is so that
+     * single adjustments inside a backend can affect the shared estimate
+     * (see the asymmetric adjustment rules above).
+     */
+    return (shared_spins_per_delay * 15 + spins_per_delay) / 16;
 }
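
A short standalone sketch of how the shared estimate behaves under this update rule (the numbers are illustrative only): because the average truncates, a backend whose local estimate dropped by just 1 still pulls the shared value down by 1, while a backend reporting a very different value moves the shared estimate only gradually, by roughly 1/16th of the gap per exit.

/*
 * Standalone sketch of the (shared * 15 + local) / 16 moving average.
 * Not part of s_lock.c; values chosen only for illustration.
 */
#include <stdio.h>

static int
update(int shared, int local)
{
    return (shared * 15 + local) / 16;      /* same formula as above */
}

int
main(void)
{
    int     shared = 100;
    int     i;

    /* truncation lets a single -1 local adjustment reach the shared value */
    printf("shared=100, local=99  -> %d\n", update(100, 99));    /* 99 */

    /* a very different local value moves the shared estimate only slowly */
    for (i = 0; i < 20; i++)
        shared = update(shared, 1000);
    printf("after 20 exits reporting 1000: %d\n", shared);  /* still well below 1000 */

    return 0;
}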

+
 /*
  * Various TAS implementations that cannot live in s_lock.h as no inline
  * definition exists (yet).