Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 33cc5d8

Browse files
committed
Change s_lock to not use any zero-delay select() calls; these are just a
waste of cycles on single-CPU machines, and of dubious utility on multi-CPU machines too. Tweak s_lock_stuck so that caller can specify timeout interval, and increase interval before declaring stuck spinlock for buffer locks and XLOG locks. On systems that have fdatasync(), use that rather than fsync() to sync WAL log writes. Ensure that WAL file is entirely allocated during XLogFileInit.
1 parent 58c4ab9 commit 33cc5d8

File tree

9 files changed

+284
-188
lines changed

9 files changed

+284
-188
lines changed

‎configure

Lines changed: 140 additions & 124 deletions
Large diffs are not rendered by default.

‎configure.in

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -772,7 +772,10 @@ PGAC_VAR_INT_TIMEZONE
772772
AC_FUNC_ACCEPT_ARGTYPES
773773
PGAC_FUNC_GETTIMEOFDAY_1ARG
774774

775-
AC_CHECK_FUNCS([fcvt getopt_long memmove pstat setproctitle setsid sigprocmask sysconf waitpid dlopen])
775+
AC_CHECK_FUNCS([fcvt getopt_long memmove pstat setproctitle setsid sigprocmask sysconf waitpid dlopen fdatasync])
776+
777+
dnl Check whether <unistd.h> declares fdatasync().
778+
AC_EGREP_HEADER(fdatasync, unistd.h, AC_DEFINE(HAVE_FDATASYNC_DECL))
776779

777780
AC_CACHE_CHECK([for PS_STRINGS], [pgac_cv_var_PS_STRINGS],
778781
[AC_TRY_LINK(

‎src/backend/access/transam/xlog.c

Lines changed: 47 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
77
* Portions Copyright (c) 1994, Regents of the University of California
88
*
9-
* $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.53 2001/02/13 20:40:25 vadim Exp $
9+
* $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.54 2001/02/18 04:39:42 tgl Exp $
1010
*
1111
*-------------------------------------------------------------------------
1212
*/
@@ -39,6 +39,13 @@
3939

4040
#include"miscadmin.h"
4141

42+
43+
/* Max time to wait to acquire XLog activity locks */
44+
#defineXLOG_LOCK_TIMEOUT(5*60*1000000)/* 5 minutes */
45+
/* Max time to wait to acquire checkpoint lock */
46+
#defineCHECKPOINT_LOCK_TIMEOUT(10*60*1000000)/* 10 minutes */
47+
48+
4249
intXLOGbuffers=8;
4350
intXLOGfiles=0;/* how many files to pre-allocate */
4451
XLogRecPtrMyLastRecPtr= {0,0};
@@ -178,8 +185,8 @@ typedef struct BkpBlock
178185
/*
179186
* We break each log file in 16Mb segments
180187
*/
181-
#defineXLogSegSize(16*1024*1024)
182-
#defineXLogLastSeg(0xffffffff / XLogSegSize)
188+
#defineXLogSegSize((uint32) (16*1024*1024))
189+
#defineXLogLastSeg(((uint32)0xffffffff) / XLogSegSize)
183190
#defineXLogFileSize(XLogLastSeg * XLogSegSize)
184191

185192
#defineNextLogSeg(_logId,_logSeg)\
@@ -423,7 +430,7 @@ begin:;
423430
}
424431
}
425432
}
426-
S_LOCK_SLEEP(&(XLogCtl->insert_lck),i++);
433+
S_LOCK_SLEEP(&(XLogCtl->insert_lck),i++,XLOG_LOCK_TIMEOUT);
427434
if (!TAS(&(XLogCtl->insert_lck)))
428435
break;
429436
}
@@ -721,7 +728,7 @@ XLogFlush(XLogRecPtr record)
721728
break;
722729
}
723730
}
724-
S_LOCK_SLEEP(&(XLogCtl->lgwr_lck),spins++);
731+
S_LOCK_SLEEP(&(XLogCtl->lgwr_lck),spins++,XLOG_LOCK_TIMEOUT);
725732
}
726733

727734
if (logFile >=0&& (LgwrResult.Write.xlogid!=logId||
@@ -741,7 +748,7 @@ XLogFlush(XLogRecPtr record)
741748
logFile=XLogFileOpen(logId,logSeg, false);
742749
}
743750

744-
if (pg_fsync(logFile)!=0)
751+
if (pg_fdatasync(logFile)!=0)
745752
elog(STOP,"fsync(logfile %u seg %u) failed: %m",
746753
logId,logSeg);
747754
LgwrResult.Flush=LgwrResult.Write;
@@ -826,7 +833,7 @@ GetFreeXLBuffer()
826833
InitXLBuffer(curridx);
827834
return;
828835
}
829-
S_LOCK_SLEEP(&(XLogCtl->lgwr_lck),spins++);
836+
S_LOCK_SLEEP(&(XLogCtl->lgwr_lck),spins++,XLOG_LOCK_TIMEOUT);
830837
}
831838
}
832839

@@ -846,7 +853,7 @@ XLogWrite(char *buffer)
846853
{
847854
if (wcnt>0)
848855
{
849-
if (pg_fsync(logFile)!=0)
856+
if (pg_fdatasync(logFile)!=0)
850857
elog(STOP,"fsync(logfile %u seg %u) failed: %m",
851858
logId,logSeg);
852859
if (LgwrResult.Write.xlogid!=logId)
@@ -928,7 +935,7 @@ XLogWrite(char *buffer)
928935
if (XLByteLT(LgwrResult.Flush,LgwrRqst.Flush)&&
929936
XLByteLE(LgwrRqst.Flush,LgwrResult.Write))
930937
{
931-
if (pg_fsync(logFile)!=0)
938+
if (pg_fdatasync(logFile)!=0)
932939
elog(STOP,"fsync(logfile %u seg %u) failed: %m",
933940
logId,logSeg);
934941
LgwrResult.Flush=LgwrResult.Write;
@@ -948,13 +955,14 @@ XLogFileInit(uint32 log, uint32 seg, bool *usexistent)
948955
{
949956
charpath[MAXPGPATH];
950957
chartpath[MAXPGPATH];
958+
charzbuffer[BLCKSZ];
951959
intfd;
960+
intnbytes;
952961

953962
XLogFileName(path,log,seg);
954963

955964
/*
956-
* Try to use existent file (checkpoint maker
957-
* creates it sometime).
965+
* Try to use existent file (checkpoint maker creates it sometimes).
958966
*/
959967
if (*usexistent)
960968
{
@@ -963,7 +971,7 @@ XLogFileInit(uint32 log, uint32 seg, bool *usexistent)
963971
{
964972
if (errno!=ENOENT)
965973
elog(STOP,"InitOpen(logfile %u seg %u) failed: %m",
966-
logId,logSeg);
974+
logId,logSeg);
967975
}
968976
else
969977
return(fd);
@@ -979,33 +987,44 @@ XLogFileInit(uint32 log, uint32 seg, bool *usexistent)
979987
elog(STOP,"InitCreate(logfile %u seg %u) failed: %m",
980988
logId,logSeg);
981989

982-
if (lseek(fd,XLogSegSize-1,SEEK_SET)!= (off_t) (XLogSegSize-1))
983-
elog(STOP,"lseek(logfile %u seg %u) failed: %m",
984-
logId,logSeg);
985-
986-
if (write(fd,"",1)!=1)
987-
elog(STOP,"write(logfile %u seg %u) failed: %m",
988-
logId,logSeg);
990+
/*
991+
* Zero-fill the file. We have to do this the hard way to ensure that
992+
* all the file space has really been allocated --- on platforms that
993+
* allow "holes" in files, just seeking to the end doesn't allocate
994+
* intermediate space. This way, we know that we have all the space
995+
* and (after the fsync below) that all the indirect blocks are down
996+
* on disk. Therefore, fdatasync(2) will be sufficient to sync future
997+
* writes to the log file.
998+
*/
999+
MemSet(zbuffer,0,sizeof(zbuffer));
1000+
for (nbytes=0;nbytes<XLogSegSize;nbytes+=sizeof(zbuffer))
1001+
{
1002+
if ((int)write(fd,zbuffer,sizeof(zbuffer))!= (int)sizeof(zbuffer))
1003+
elog(STOP,"ZeroFill(logfile %u seg %u) failed: %m",
1004+
logId,logSeg);
1005+
}
9891006

9901007
if (pg_fsync(fd)!=0)
9911008
elog(STOP,"fsync(logfile %u seg %u) failed: %m",
9921009
logId,logSeg);
9931010

994-
if (lseek(fd,0,SEEK_SET)<0)
995-
elog(STOP,"lseek(logfile %u seg %u off %u) failed: %m",
996-
log,seg,0);
997-
9981011
close(fd);
9991012

1013+
/*
1014+
* Prefer link() to rename() here just to be sure that we don't overwrite
1015+
* an existing logfile. However, there shouldn't be one, so rename()
1016+
* is an acceptable substitute except for the truly paranoid.
1017+
*/
10001018
#ifndef__BEOS__
10011019
if (link(tpath,path)<0)
1020+
elog(STOP,"InitRelink(logfile %u seg %u) failed: %m",
1021+
logId,logSeg);
1022+
unlink(tpath);
10021023
#else
10031024
if (rename(tpath,path)<0)
1004-
#endif
10051025
elog(STOP,"InitRelink(logfile %u seg %u) failed: %m",
10061026
logId,logSeg);
1007-
1008-
unlink(tpath);
1027+
#endif
10091028

10101029
fd=BasicOpenFile(path,O_RDWR |PG_BINARY,S_IRUSR |S_IWUSR);
10111030
if (fd<0)
@@ -2101,7 +2120,8 @@ CreateCheckPoint(bool shutdown)
21012120
/* Grab lock, using larger than normal sleep between tries (1 sec) */
21022121
while (TAS(&(XLogCtl->chkp_lck)))
21032122
{
2104-
S_LOCK_SLEEP_INTERVAL(&(XLogCtl->chkp_lck),spins++,1000000);
2123+
S_LOCK_SLEEP_INTERVAL(&(XLogCtl->chkp_lck),spins++,
2124+
CHECKPOINT_LOCK_TIMEOUT,1000000);
21052125
}
21062126

21072127
memset(&checkPoint,0,sizeof(checkPoint));

‎src/backend/storage/buffer/bufmgr.c

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.106 2001/01/24 19:43:05 momjian Exp $
11+
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.107 2001/02/18 04:39:42 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -1990,6 +1990,9 @@ UnlockBuffers(void)
19901990
}
19911991
}
19921992

1993+
/* Max time to wait to acquire a buffer read or write lock */
1994+
#defineBUFFER_LOCK_TIMEOUT(10*60*1000000)/* 10 minutes */
1995+
19931996
void
19941997
LockBuffer(Bufferbuffer,intmode)
19951998
{
@@ -2041,7 +2044,7 @@ LockBuffer(Buffer buffer, int mode)
20412044
{
20422045
S_UNLOCK(&(buf->cntx_lock));
20432046
RESUME_INTERRUPTS();
2044-
S_LOCK_SLEEP(&(buf->cntx_lock),i++);
2047+
S_LOCK_SLEEP(&(buf->cntx_lock),i++,BUFFER_LOCK_TIMEOUT);
20452048
HOLD_INTERRUPTS();
20462049
S_LOCK(&(buf->cntx_lock));
20472050
}
@@ -2069,7 +2072,7 @@ LockBuffer(Buffer buffer, int mode)
20692072
}
20702073
S_UNLOCK(&(buf->cntx_lock));
20712074
RESUME_INTERRUPTS();
2072-
S_LOCK_SLEEP(&(buf->cntx_lock),i++);
2075+
S_LOCK_SLEEP(&(buf->cntx_lock),i++,BUFFER_LOCK_TIMEOUT);
20732076
HOLD_INTERRUPTS();
20742077
S_LOCK(&(buf->cntx_lock));
20752078
}

‎src/backend/storage/buffer/s_lock.c

Lines changed: 40 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/Attic/s_lock.c,v 1.32 2001/01/24 19:43:06 momjian Exp $
11+
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/Attic/s_lock.c,v 1.33 2001/02/18 04:39:42 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -21,23 +21,39 @@
2121
#include"storage/s_lock.h"
2222

2323

24-
/*
24+
/*----------
2525
* Each time we busy spin we select the next element of this array as the
2626
* number of microseconds to wait. This accomplishes pseudo random back-off.
27-
* Values are not critical but 10 milliseconds is a common platform
28-
* granularity.
2927
*
30-
* Total time to cycle through all 20 entries might be about .07 sec,
31-
* so the given value of S_MAX_BUSY results in timeout after ~70 sec.
28+
* Note that on most platforms, specified values will be rounded up to the
29+
* next multiple of a clock tick, which is often ten milliseconds (10000).
30+
* So, we are being way overoptimistic to assume that these different values
31+
* are really different, other than the last. But there are a few platforms
32+
* with better-than-usual timekeeping, and on these we will get pretty good
33+
* pseudo-random behavior.
34+
*
35+
* Total time to cycle through all 20 entries will be at least 100 msec,
36+
* more commonly (10 msec resolution) 220 msec, and on some platforms
37+
* as much as 420 msec (when the remainder of the current tick cycle is
38+
* ignored in deciding when to time out, as on FreeBSD and older Linuxen).
39+
* We use the 100msec figure to figure max_spins, so actual timeouts may
40+
* be as much as four times the nominal value, but will never be less.
41+
*----------
3242
*/
3343
#defineS_NSPINCYCLE20
34-
#defineS_MAX_BUSY1000 * S_NSPINCYCLE
3544

3645
ints_spincycle[S_NSPINCYCLE]=
37-
{0,0,0,0,10000,0,0,0,10000,0,
38-
0,10000,0,0,10000,0,10000,0,10000,10000
46+
{1,10,100,1000,
47+
10000,1000,1000,1000,
48+
10000,1000,1000,10000,
49+
1000,1000,10000,1000,
50+
10000,1000,10000,30000
3951
};
4052

53+
#defineAVG_SPINCYCLE5000/* average entry in microsec: 100ms / 20 */
54+
55+
#defineDEFAULT_TIMEOUT(100*1000000)/* default timeout: 100 sec */
56+
4157

4258
/*
4359
* s_lock_stuck() - complain about a stuck spinlock
@@ -58,34 +74,40 @@ s_lock_stuck(volatile slock_t *lock, const char *file, const int line)
5874
/*
5975
* s_lock_sleep() - sleep a pseudo-random amount of time, check for timeout
6076
*
61-
* Normally 'microsec' is 0, specifying to use the next s_spincycle[] value.
77+
* The 'timeout' is given in microsec, or may be 0 for "infinity". Note that
78+
* this will be a lower bound (a fairly loose lower bound, on most platforms).
79+
*
80+
* 'microsec' is the number of microsec to delay per loop. Normally
81+
* 'microsec' is 0, specifying to use the next s_spincycle[] value.
6282
* Some callers may pass a nonzero interval, specifying to use exactly that
6383
* delay value rather than a pseudo-random delay.
6484
*/
6585
void
66-
s_lock_sleep(unsignedspins,intmicrosec,
86+
s_lock_sleep(unsignedspins,inttimeout,intmicrosec,
6787
volatileslock_t*lock,
6888
constchar*file,constintline)
6989
{
7090
structtimevaldelay;
71-
unsignedmax_spins;
7291

7392
if (microsec>0)
7493
{
7594
delay.tv_sec=0;
7695
delay.tv_usec=microsec;
77-
/* two-minute timeout in this case */
78-
max_spins=120000000 /microsec;
7996
}
8097
else
8198
{
8299
delay.tv_sec=0;
83100
delay.tv_usec=s_spincycle[spins %S_NSPINCYCLE];
84-
max_spins=S_MAX_BUSY;
101+
microsec=AVG_SPINCYCLE;/* use average to figure timeout */
85102
}
86103

87-
if (spins>max_spins)
88-
s_lock_stuck(lock,file,line);
104+
if (timeout>0)
105+
{
106+
unsignedmax_spins=timeout /microsec;
107+
108+
if (spins>max_spins)
109+
s_lock_stuck(lock,file,line);
110+
}
89111

90112
(void)select(0,NULL,NULL,NULL,&delay);
91113
}
@@ -110,7 +132,7 @@ s_lock(volatile slock_t *lock, const char *file, const int line)
110132
*/
111133
while (TAS(lock))
112134
{
113-
s_lock_sleep(spins++,0,lock,file,line);
135+
s_lock_sleep(spins++,DEFAULT_TIMEOUT,0,lock,file,line);
114136
CHECK_FOR_INTERRUPTS();
115137
}
116138
}

‎src/backend/storage/file/fd.c

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1994, Regents of the University of California
88
*
99
* IDENTIFICATION
10-
* $Header: /cvsroot/pgsql/src/backend/storage/file/fd.c,v 1.72 2001/02/17 01:00:04 tgl Exp $
10+
* $Header: /cvsroot/pgsql/src/backend/storage/file/fd.c,v 1.73 2001/02/18 04:39:42 tgl Exp $
1111
*
1212
* NOTES:
1313
*
@@ -193,7 +193,7 @@ static char *filepath(char *filename);
193193
staticlongpg_nofile(void);
194194

195195
/*
196-
* pg_fsync --- same as fsync except does nothing if-F switch was given
196+
* pg_fsync --- same as fsync except does nothing if enableFsync is off
197197
*/
198198
int
199199
pg_fsync(intfd)
@@ -204,6 +204,26 @@ pg_fsync(int fd)
204204
return0;
205205
}
206206

207+
/*
208+
* pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
209+
*
210+
* Not all platforms have fdatasync; treat as fsync if not available.
211+
*/
212+
int
213+
pg_fdatasync(intfd)
214+
{
215+
if (enableFsync)
216+
{
217+
#ifdefHAVE_FDATASYNC
218+
returnfdatasync(fd);
219+
#else
220+
returnfsync(fd);
221+
#endif
222+
}
223+
else
224+
return0;
225+
}
226+
207227
/*
208228
* BasicOpenFile --- same as open(2) except can free other FDs if needed
209229
*

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp