Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitd872510

Browse files
committed
Replace buffer I/O locks with condition variables.
1. Backends waiting for buffer I/O are now interruptible.2. If something goes wrong in a backend that is currently performingI/O, waiting backends no longer wake up until that backend reachesAbortBufferIO() and broadcasts on the CV. Previously, any waiters wouldwake up (because the I/O lock was automatically released) and thenbusy-loop until AbortBufferIO() cleared BM_IO_IN_PROGRESS.3. LWLockMinimallyPadded is removed, as it would now be unused.Author: Robert Haas <robertmhaas@gmail.com>Reviewed-by: Thomas Munro <thomas.munro@gmail.com>Reviewed-by: Julien Rouhaud <rjuju123@gmail.com>Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> (earlier version, 2016)Discussion:https://postgr.es/m/CA%2BhUKGJ8nBFrjLuCTuqKN0pd2PQOwj9b_jnsiGFFMDvUxahj_A%40mail.gmail.comDiscussion:https://postgr.es/m/CA+Tgmoaj2aPti0yho7FeEf2qt-JgQPRWb0gci_o1Hfr=C56Xng@mail.gmail.com
1 parentc3ffe34 commitd872510

File tree

11 files changed

+61
-116
lines changed

11 files changed

+61
-116
lines changed

‎doc/src/sgml/monitoring.sgml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1586,6 +1586,10 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
15861586
<entry>Waiting for the page number needed to continue a parallel B-tree
15871587
scan to become available.</entry>
15881588
</row>
1589+
<row>
1590+
<entry><literal>BufferIO</literal></entry>
1591+
<entry>Waiting for buffer I/O to complete.</entry>
1592+
</row>
15891593
<row>
15901594
<entry><literal>CheckpointDone</literal></entry>
15911595
<entry>Waiting for a checkpoint to complete.</entry>
@@ -1876,10 +1880,6 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
18761880
<entry><literal>BufferContent</literal></entry>
18771881
<entry>Waiting to access a data page in memory.</entry>
18781882
</row>
1879-
<row>
1880-
<entry><literal>BufferIO</literal></entry>
1881-
<entry>Waiting for I/O on a data page.</entry>
1882-
</row>
18831883
<row>
18841884
<entry><literal>BufferMapping</literal></entry>
18851885
<entry>Waiting to associate a data block with a buffer in the buffer

‎src/backend/postmaster/pgstat.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4010,6 +4010,9 @@ pgstat_get_wait_ipc(WaitEventIPC w)
40104010
caseWAIT_EVENT_BTREE_PAGE:
40114011
event_name="BtreePage";
40124012
break;
4013+
caseWAIT_EVENT_BUFFER_IO:
4014+
event_name="BufferIO";
4015+
break;
40134016
caseWAIT_EVENT_CHECKPOINT_DONE:
40144017
event_name="CheckpointDone";
40154018
break;

‎src/backend/storage/buffer/README

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -145,14 +145,11 @@ held within the buffer. Each buffer header also contains an LWLock, the
145145
"buffer content lock", that *does* represent the right to access the data
146146
in the buffer. It is used per the rules above.
147147

148-
There is yet another set of per-buffer LWLocks, the io_in_progress locks,
149-
that are used to wait for I/O on a buffer to complete. The process doing
150-
a read or write takes exclusive lock for the duration, and processes that
151-
need to wait for completion try to take shared locks (which they release
152-
immediately upon obtaining). XXX on systems where an LWLock represents
153-
nontrivial resources, it's fairly annoying to need so many locks. Possibly
154-
we could use per-backend LWLocks instead (a buffer header would then contain
155-
a field to show which backend is doing its I/O).
148+
* The BM_IO_IN_PROGRESS flag acts as a kind of lock, used to wait for I/O on a
149+
buffer to complete (and in releases before 14, it was accompanied by a
150+
per-buffer LWLock). The process doing a read or write sets the flag for the
151+
duration, and processes that need to wait for it to be cleared sleep on a
152+
condition variable.
156153

157154

158155
Normal Buffer Replacement Strategy

‎src/backend/storage/buffer/buf_init.c

Lines changed: 13 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919

2020
BufferDescPadded*BufferDescriptors;
2121
char*BufferBlocks;
22-
LWLockMinimallyPadded*BufferIOLWLockArray=NULL;
22+
ConditionVariableMinimallyPadded*BufferIOCVArray;
2323
WritebackContextBackendWritebackContext;
2424
CkptSortItem*CkptBufferIds;
2525

@@ -68,7 +68,7 @@ InitBufferPool(void)
6868
{
6969
boolfoundBufs,
7070
foundDescs,
71-
foundIOLocks,
71+
foundIOCV,
7272
foundBufCkpt;
7373

7474
/* Align descriptors to a cacheline boundary. */
@@ -81,11 +81,11 @@ InitBufferPool(void)
8181
ShmemInitStruct("Buffer Blocks",
8282
NBuffers* (Size)BLCKSZ,&foundBufs);
8383

84-
/* Alignlwlocksto cacheline boundary */
85-
BufferIOLWLockArray= (LWLockMinimallyPadded*)
86-
ShmemInitStruct("Buffer IOLocks",
87-
NBuffers*(Size)sizeof(LWLockMinimallyPadded),
88-
&foundIOLocks);
84+
/* Aligncondition variablesto cacheline boundary. */
85+
BufferIOCVArray= (ConditionVariableMinimallyPadded*)
86+
ShmemInitStruct("Buffer IOCondition Variables",
87+
NBuffers*sizeof(ConditionVariableMinimallyPadded),
88+
&foundIOCV);
8989

9090
/*
9191
* The array used to sort to-be-checkpointed buffer ids is located in
@@ -98,10 +98,10 @@ InitBufferPool(void)
9898
ShmemInitStruct("Checkpoint BufferIds",
9999
NBuffers*sizeof(CkptSortItem),&foundBufCkpt);
100100

101-
if (foundDescs||foundBufs||foundIOLocks||foundBufCkpt)
101+
if (foundDescs||foundBufs||foundIOCV||foundBufCkpt)
102102
{
103103
/* should find all of these, or none of them */
104-
Assert(foundDescs&&foundBufs&&foundIOLocks&&foundBufCkpt);
104+
Assert(foundDescs&&foundBufs&&foundIOCV&&foundBufCkpt);
105105
/* note: this path is only taken in EXEC_BACKEND case */
106106
}
107107
else
@@ -131,8 +131,7 @@ InitBufferPool(void)
131131
LWLockInitialize(BufferDescriptorGetContentLock(buf),
132132
LWTRANCHE_BUFFER_CONTENT);
133133

134-
LWLockInitialize(BufferDescriptorGetIOLock(buf),
135-
LWTRANCHE_BUFFER_IO);
134+
ConditionVariableInit(BufferDescriptorGetIOCV(buf));
136135
}
137136

138137
/* Correct last entry of linked list */
@@ -169,16 +168,9 @@ BufferShmemSize(void)
169168
/* size of stuff controlled by freelist.c */
170169
size=add_size(size,StrategyShmemSize());
171170

172-
/*
173-
* It would be nice to include the I/O locks in the BufferDesc, but that
174-
* would increase the size of a BufferDesc to more than one cache line,
175-
* and benchmarking has shown that keeping every BufferDesc aligned on a
176-
* cache line boundary is important for performance. So, instead, the
177-
* array of I/O locks is allocated in a separate tranche. Because those
178-
* locks are not highly contended, we lay out the array with minimal
179-
* padding.
180-
*/
181-
size=add_size(size,mul_size(NBuffers,sizeof(LWLockMinimallyPadded)));
171+
/* size of I/O condition variables */
172+
size=add_size(size,mul_size(NBuffers,
173+
sizeof(ConditionVariableMinimallyPadded)));
182174
/* to allow aligning the above */
183175
size=add_size(size,PG_CACHE_LINE_SIZE);
184176

‎src/backend/storage/buffer/bufmgr.c

Lines changed: 18 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1352,9 +1352,10 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
13521352
LWLockRelease(newPartitionLock);
13531353

13541354
/*
1355-
* Buffer contents are currently invalid. Try to get the io_in_progress
1356-
* lock. If StartBufferIO returns false, then someone else managed to
1357-
* read it before we did, so there's nothing left for BufferAlloc() to do.
1355+
* Buffer contents are currently invalid. Try to obtain the right to
1356+
* start I/O. If StartBufferIO returns false, then someone else managed
1357+
* to read it before we did, so there's nothing left for BufferAlloc() to
1358+
* do.
13581359
*/
13591360
if (StartBufferIO(buf, true))
13601361
*foundPtr= false;
@@ -1777,9 +1778,8 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner)
17771778
*/
17781779
VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf),BLCKSZ);
17791780

1780-
/* I'd better not still holdany locks onthe buffer */
1781+
/* I'd better not still hold the buffer content lock */
17811782
Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
1782-
Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
17831783

17841784
/*
17851785
* Decrement the shared reference count.
@@ -2742,9 +2742,9 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
27422742
uint32buf_state;
27432743

27442744
/*
2745-
*Acquire the buffer's io_in_progress lock. If StartBufferIO returns
2746-
*false, thensomeone else flushed the buffer before we could, so we need
2747-
*not doanything.
2745+
*Try to start an I/O operation. If StartBufferIO returns false, then
2746+
* someone else flushed the buffer before we could, so we need not do
2747+
* anything.
27482748
*/
27492749
if (!StartBufferIO(buf, false))
27502750
return;
@@ -2800,7 +2800,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
28002800
/*
28012801
* Now it's safe to write buffer to disk. Note that no one else should
28022802
* have been able to write it while we were busy with log flushing because
2803-
*we havetheio_in_progress lock.
2803+
*only one process at a time can settheBM_IO_IN_PROGRESS bit.
28042804
*/
28052805
bufBlock=BufHdrGetBlock(buf);
28062806

@@ -2835,7 +2835,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
28352835

28362836
/*
28372837
* Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
2838-
* end theio_in_progress state.
2838+
* end theBM_IO_IN_PROGRESS state.
28392839
*/
28402840
TerminateBufferIO(buf, true,0);
28412841

@@ -4271,7 +4271,7 @@ IsBufferCleanupOK(Buffer buffer)
42714271
*Functions for buffer I/O handling
42724272
*
42734273
*Note: We assume that nested buffer I/O never occurs.
4274-
*i.e at most oneio_in_progress lock isheld per proc.
4274+
*i.e at most oneBM_IO_IN_PROGRESS bit isset per proc.
42754275
*
42764276
*Also note that these are used only for shared buffers, not local ones.
42774277
*/
@@ -4282,13 +4282,9 @@ IsBufferCleanupOK(Buffer buffer)
42824282
staticvoid
42834283
WaitIO(BufferDesc*buf)
42844284
{
4285-
/*
4286-
* Changed to wait until there's no IO - Inoue 01/13/2000
4287-
*
4288-
* Note this is *necessary* because an error abort in the process doing
4289-
* I/O could release the io_in_progress_lock prematurely. See
4290-
* AbortBufferIO.
4291-
*/
4285+
ConditionVariable*cv=BufferDescriptorGetIOCV(buf);
4286+
4287+
ConditionVariablePrepareToSleep(cv);
42924288
for (;;)
42934289
{
42944290
uint32buf_state;
@@ -4303,9 +4299,9 @@ WaitIO(BufferDesc *buf)
43034299

43044300
if (!(buf_state&BM_IO_IN_PROGRESS))
43054301
break;
4306-
LWLockAcquire(BufferDescriptorGetIOLock(buf),LW_SHARED);
4307-
LWLockRelease(BufferDescriptorGetIOLock(buf));
4302+
ConditionVariableSleep(cv,WAIT_EVENT_BUFFER_IO);
43084303
}
4304+
ConditionVariableCancelSleep();
43094305
}
43104306

43114307
/*
@@ -4317,7 +4313,7 @@ WaitIO(BufferDesc *buf)
43174313
* In some scenarios there are race conditions in which multiple backends
43184314
* could attempt the same I/O operation concurrently. If someone else
43194315
* has already started I/O on this buffer then we will block on the
4320-
*io_in_progress lock until he's done.
4316+
*I/O condition variable until he's done.
43214317
*
43224318
* Input operations are only attempted on buffers that are not BM_VALID,
43234319
* and output operations only on buffers that are BM_VALID and BM_DIRTY,
@@ -4335,25 +4331,11 @@ StartBufferIO(BufferDesc *buf, bool forInput)
43354331

43364332
for (;;)
43374333
{
4338-
/*
4339-
* Grab the io_in_progress lock so that other processes can wait for
4340-
* me to finish the I/O.
4341-
*/
4342-
LWLockAcquire(BufferDescriptorGetIOLock(buf),LW_EXCLUSIVE);
4343-
43444334
buf_state=LockBufHdr(buf);
43454335

43464336
if (!(buf_state&BM_IO_IN_PROGRESS))
43474337
break;
4348-
4349-
/*
4350-
* The only way BM_IO_IN_PROGRESS could be set when the io_in_progress
4351-
* lock isn't held is if the process doing the I/O is recovering from
4352-
* an error (see AbortBufferIO). If that's the case, we must wait for
4353-
* him to get unwedged.
4354-
*/
43554338
UnlockBufHdr(buf,buf_state);
4356-
LWLockRelease(BufferDescriptorGetIOLock(buf));
43574339
WaitIO(buf);
43584340
}
43594341

@@ -4363,7 +4345,6 @@ StartBufferIO(BufferDesc *buf, bool forInput)
43634345
{
43644346
/* someone else already did the I/O */
43654347
UnlockBufHdr(buf,buf_state);
4366-
LWLockRelease(BufferDescriptorGetIOLock(buf));
43674348
return false;
43684349
}
43694350

@@ -4381,7 +4362,6 @@ StartBufferIO(BufferDesc *buf, bool forInput)
43814362
*(Assumptions)
43824363
*My process is executing IO for the buffer
43834364
*BM_IO_IN_PROGRESS bit is set for the buffer
4384-
*We hold the buffer's io_in_progress lock
43854365
*The buffer is Pinned
43864366
*
43874367
* If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
@@ -4413,7 +4393,7 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
44134393

44144394
InProgressBuf=NULL;
44154395

4416-
LWLockRelease(BufferDescriptorGetIOLock(buf));
4396+
ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
44174397
}
44184398

44194399
/*
@@ -4434,14 +4414,6 @@ AbortBufferIO(void)
44344414
{
44354415
uint32buf_state;
44364416

4437-
/*
4438-
* Since LWLockReleaseAll has already been called, we're not holding
4439-
* the buffer's io_in_progress_lock. We have to re-acquire it so that
4440-
* we can use TerminateBufferIO. Anyone who's executing WaitIO on the
4441-
* buffer will be in a busy spin until we succeed in doing this.
4442-
*/
4443-
LWLockAcquire(BufferDescriptorGetIOLock(buf),LW_EXCLUSIVE);
4444-
44454417
buf_state=LockBufHdr(buf);
44464418
Assert(buf_state&BM_IO_IN_PROGRESS);
44474419
if (IsForInput)

‎src/backend/storage/lmgr/lwlock.c

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -146,8 +146,6 @@ static const char *const BuiltinTrancheNames[] = {
146146
"WALInsert",
147147
/* LWTRANCHE_BUFFER_CONTENT: */
148148
"BufferContent",
149-
/* LWTRANCHE_BUFFER_IO: */
150-
"BufferIO",
151149
/* LWTRANCHE_REPLICATION_ORIGIN_STATE: */
152150
"ReplicationOriginState",
153151
/* LWTRANCHE_REPLICATION_SLOT_IO: */
@@ -469,8 +467,7 @@ CreateLWLocks(void)
469467
StaticAssertStmt(LW_VAL_EXCLUSIVE> (uint32)MAX_BACKENDS,
470468
"MAX_BACKENDS too big for lwlock.c");
471469

472-
StaticAssertStmt(sizeof(LWLock) <=LWLOCK_MINIMAL_SIZE&&
473-
sizeof(LWLock) <=LWLOCK_PADDED_SIZE,
470+
StaticAssertStmt(sizeof(LWLock) <=LWLOCK_PADDED_SIZE,
474471
"Miscalculated LWLock padding");
475472

476473
if (!IsUnderPostmaster)

‎src/include/pgstat.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -971,6 +971,7 @@ typedef enum
971971
WAIT_EVENT_BGWORKER_SHUTDOWN,
972972
WAIT_EVENT_BGWORKER_STARTUP,
973973
WAIT_EVENT_BTREE_PAGE,
974+
WAIT_EVENT_BUFFER_IO,
974975
WAIT_EVENT_CHECKPOINT_DONE,
975976
WAIT_EVENT_CHECKPOINT_START,
976977
WAIT_EVENT_EXECUTE_GATHER,

‎src/include/storage/buf_internals.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include"port/atomics.h"
1919
#include"storage/buf.h"
2020
#include"storage/bufmgr.h"
21+
#include"storage/condition_variable.h"
2122
#include"storage/latch.h"
2223
#include"storage/lwlock.h"
2324
#include"storage/shmem.h"
@@ -184,7 +185,6 @@ typedef struct BufferDesc
184185

185186
intwait_backend_pid;/* backend PID of pin-count waiter */
186187
intfreeNext;/* link in freelist chain */
187-
188188
LWLockcontent_lock;/* to lock access to buffer contents */
189189
}BufferDesc;
190190

@@ -221,12 +221,12 @@ typedef union BufferDescPadded
221221

222222
#defineBufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)
223223

224-
#defineBufferDescriptorGetIOLock(bdesc) \
225-
(&(BufferIOLWLockArray[(bdesc)->buf_id]).lock)
224+
#defineBufferDescriptorGetIOCV(bdesc) \
225+
(&(BufferIOCVArray[(bdesc)->buf_id]).cv)
226226
#defineBufferDescriptorGetContentLock(bdesc) \
227227
((LWLock*) (&(bdesc)->content_lock))
228228

229-
externPGDLLIMPORTLWLockMinimallyPadded*BufferIOLWLockArray;
229+
externPGDLLIMPORTConditionVariableMinimallyPadded*BufferIOCVArray;
230230

231231
/*
232232
* The freeNext field is either the index of the next freelist entry,

‎src/include/storage/condition_variable.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,17 @@ typedef struct
3131
proclist_headwakeup;/* list of wake-able processes */
3232
}ConditionVariable;
3333

34+
/*
35+
* Pad a condition variable to a power-of-two size so that an array of
36+
* condition variables does not cross a cache line boundary.
37+
*/
38+
#defineCV_MINIMAL_SIZE(sizeof(ConditionVariable) <= 16 ? 16 : 32)
39+
typedefunionConditionVariableMinimallyPadded
40+
{
41+
ConditionVariablecv;
42+
charpad[CV_MINIMAL_SIZE];
43+
}ConditionVariableMinimallyPadded;
44+
3445
/* Initialize a condition variable. */
3546
externvoidConditionVariableInit(ConditionVariable*cv);
3647

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp