Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 6150a1b

Browse files
committed
Move buffer I/O and content LWLocks out of the main tranche.

Move the content lock directly into the BufferDesc, so that locking and pinning a buffer touches only one cache line rather than two. Adjust the definition of BufferDesc slightly so that this doesn't make the BufferDesc any larger than one cache line (at least on platforms where a spinlock is only 1 or 2 bytes).

We can't fit the I/O locks into the BufferDesc and stay within one cache line, so move those to a completely separate tranche. This leaves a relatively limited number of LWLocks in the main tranche, so increase the padding of those remaining locks to a full cache line, rather than allowing adjacent locks to share a cache line, hopefully reducing false sharing.

Performance testing shows that these changes make little difference on laptop-class machines, but help significantly on larger servers, especially those with more than 2 sockets.

Andres Freund, originally based on an earlier patch by Simon Riggs. Review and cosmetic adjustments (including heavy rewriting of the comments) by me.
1 parent 3fed417 · commit 6150a1b

File tree

6 files changed

+151
-67
lines changed

6 files changed

+151
-67
lines changed

‎src/backend/storage/buffer/buf_init.c

Lines changed: 51 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@
2020

2121
BufferDescPadded*BufferDescriptors;
2222
char*BufferBlocks;
23+
LWLockMinimallyPadded*BufferIOLWLockArray=NULL;
24+
LWLockTrancheBufferIOLWLockTranche;
25+
LWLockTrancheBufferContentLWLockTranche;
2326

2427

2528
/*
@@ -65,22 +68,45 @@ void
6568
InitBufferPool(void)
6669
{
6770
boolfoundBufs,
68-
foundDescs;
71+
foundDescs,
72+
foundIOLocks;
6973

7074
/* Align descriptors to a cacheline boundary. */
71-
BufferDescriptors= (BufferDescPadded*)CACHELINEALIGN(
72-
ShmemInitStruct("Buffer Descriptors",
73-
NBuffers*sizeof(BufferDescPadded)+PG_CACHE_LINE_SIZE,
74-
&foundDescs));
75+
BufferDescriptors= (BufferDescPadded*)
76+
CACHELINEALIGN(
77+
ShmemInitStruct("Buffer Descriptors",
78+
NBuffers*sizeof(BufferDescPadded)
79+
+PG_CACHE_LINE_SIZE,
80+
&foundDescs));
7581

7682
BufferBlocks= (char*)
7783
ShmemInitStruct("Buffer Blocks",
7884
NBuffers* (Size)BLCKSZ,&foundBufs);
7985

80-
if (foundDescs||foundBufs)
86+
/* Align lwlocks to cacheline boundary */
87+
BufferIOLWLockArray= (LWLockMinimallyPadded*)
88+
CACHELINEALIGN(ShmemInitStruct("Buffer IO Locks",
89+
NBuffers* (Size)sizeof(LWLockMinimallyPadded)
90+
+PG_CACHE_LINE_SIZE,
91+
&foundIOLocks));
92+
93+
BufferIOLWLockTranche.name="Buffer IO Locks";
94+
BufferIOLWLockTranche.array_base=BufferIOLWLockArray;
95+
BufferIOLWLockTranche.array_stride=sizeof(LWLockMinimallyPadded);
96+
LWLockRegisterTranche(LWTRANCHE_BUFFER_IO_IN_PROGRESS,
97+
&BufferIOLWLockTranche);
98+
99+
BufferContentLWLockTranche.name="Buffer Content Locks";
100+
BufferContentLWLockTranche.array_base=
101+
((char*)BufferDescriptors)+ offsetof(BufferDesc,content_lock);
102+
BufferContentLWLockTranche.array_stride=sizeof(BufferDescPadded);
103+
LWLockRegisterTranche(LWTRANCHE_BUFFER_CONTENT,
104+
&BufferContentLWLockTranche);
105+
106+
if (foundDescs||foundBufs||foundIOLocks)
81107
{
82-
/*bothshouldbe presentorneither */
83-
Assert(foundDescs&&foundBufs);
108+
/* shouldfind all of these,ornone of them */
109+
Assert(foundDescs&&foundBufs&&foundIOLocks);
84110
/* note: this path is only taken in EXEC_BACKEND case */
85111
}
86112
else
@@ -110,8 +136,11 @@ InitBufferPool(void)
110136
*/
111137
buf->freeNext=i+1;
112138

113-
buf->io_in_progress_lock=LWLockAssign();
114-
buf->content_lock=LWLockAssign();
139+
LWLockInitialize(BufferDescriptorGetContentLock(buf),
140+
LWTRANCHE_BUFFER_CONTENT);
141+
142+
LWLockInitialize(BufferDescriptorGetIOLock(buf),
143+
LWTRANCHE_BUFFER_IO_IN_PROGRESS);
115144
}
116145

117146
/* Correct last entry of linked list */
@@ -144,5 +173,17 @@ BufferShmemSize(void)
144173
/* size of stuff controlled by freelist.c */
145174
size=add_size(size,StrategyShmemSize());
146175

176+
/*
177+
* It would be nice to include the I/O locks in the BufferDesc, but that
178+
* would increase the size of a BufferDesc to more than one cache line, and
179+
* benchmarking has shown that keeping every BufferDesc aligned on a cache
180+
* line boundary is important for performance. So, instead, the array of
181+
* I/O locks is allocated in a separate tranche. Because those locks are
182+
* not highly contentended, we lay out the array with minimal padding.
183+
*/
184+
size=add_size(size,mul_size(NBuffers,sizeof(LWLockMinimallyPadded)));
185+
/* to allow aligning the above */
186+
size=add_size(size,PG_CACHE_LINE_SIZE);
187+
147188
returnsize;
148189
}

‎src/backend/storage/buffer/bufmgr.c

Lines changed: 30 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -738,7 +738,8 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
738738
if (!isLocalBuf)
739739
{
740740
if (mode==RBM_ZERO_AND_LOCK)
741-
LWLockAcquire(bufHdr->content_lock,LW_EXCLUSIVE);
741+
LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
742+
LW_EXCLUSIVE);
742743
elseif (mode==RBM_ZERO_AND_CLEANUP_LOCK)
743744
LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
744745
}
@@ -879,7 +880,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
879880
if ((mode==RBM_ZERO_AND_LOCK||mode==RBM_ZERO_AND_CLEANUP_LOCK)&&
880881
!isLocalBuf)
881882
{
882-
LWLockAcquire(bufHdr->content_lock,LW_EXCLUSIVE);
883+
LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),LW_EXCLUSIVE);
883884
}
884885

885886
if (isLocalBuf)
@@ -1045,7 +1046,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
10451046
* happens to be trying to split the page the first one got from
10461047
* StrategyGetBuffer.)
10471048
*/
1048-
if (LWLockConditionalAcquire(buf->content_lock,LW_SHARED))
1049+
if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
1050+
LW_SHARED))
10491051
{
10501052
/*
10511053
* If using a nondefault strategy, and writing the buffer
@@ -1067,7 +1069,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
10671069
StrategyRejectBuffer(strategy,buf))
10681070
{
10691071
/* Drop lock/pin and loop around for another buffer */
1070-
LWLockRelease(buf->content_lock);
1072+
LWLockRelease(BufferDescriptorGetContentLock(buf));
10711073
UnpinBuffer(buf, true);
10721074
continue;
10731075
}
@@ -1080,7 +1082,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
10801082
smgr->smgr_rnode.node.relNode);
10811083

10821084
FlushBuffer(buf,NULL);
1083-
LWLockRelease(buf->content_lock);
1085+
LWLockRelease(BufferDescriptorGetContentLock(buf));
10841086

10851087
TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum,blockNum,
10861088
smgr->smgr_rnode.node.spcNode,
@@ -1395,7 +1397,7 @@ MarkBufferDirty(Buffer buffer)
13951397

13961398
Assert(BufferIsPinned(buffer));
13971399
/* unfortunately we can't check if the lock is held exclusively */
1398-
Assert(LWLockHeldByMe(bufHdr->content_lock));
1400+
Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
13991401

14001402
LockBufHdr(bufHdr);
14011403

@@ -1595,8 +1597,8 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner)
15951597
if (ref->refcount==0)
15961598
{
15971599
/* I'd better not still hold any locks on the buffer */
1598-
Assert(!LWLockHeldByMe(buf->content_lock));
1599-
Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
1600+
Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
1601+
Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
16001602

16011603
LockBufHdr(buf);
16021604

@@ -2116,11 +2118,11 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
21162118
* buffer is clean by the time we've locked it.)
21172119
*/
21182120
PinBuffer_Locked(bufHdr);
2119-
LWLockAcquire(bufHdr->content_lock,LW_SHARED);
2121+
LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),LW_SHARED);
21202122

21212123
FlushBuffer(bufHdr,NULL);
21222124

2123-
LWLockRelease(bufHdr->content_lock);
2125+
LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
21242126
UnpinBuffer(bufHdr, true);
21252127

21262128
returnresult |BUF_WRITTEN;
@@ -2926,9 +2928,9 @@ FlushRelationBuffers(Relation rel)
29262928
(bufHdr->flags&BM_VALID)&& (bufHdr->flags&BM_DIRTY))
29272929
{
29282930
PinBuffer_Locked(bufHdr);
2929-
LWLockAcquire(bufHdr->content_lock,LW_SHARED);
2931+
LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),LW_SHARED);
29302932
FlushBuffer(bufHdr,rel->rd_smgr);
2931-
LWLockRelease(bufHdr->content_lock);
2933+
LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
29322934
UnpinBuffer(bufHdr, true);
29332935
}
29342936
else
@@ -2978,9 +2980,9 @@ FlushDatabaseBuffers(Oid dbid)
29782980
(bufHdr->flags&BM_VALID)&& (bufHdr->flags&BM_DIRTY))
29792981
{
29802982
PinBuffer_Locked(bufHdr);
2981-
LWLockAcquire(bufHdr->content_lock,LW_SHARED);
2983+
LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),LW_SHARED);
29822984
FlushBuffer(bufHdr,NULL);
2983-
LWLockRelease(bufHdr->content_lock);
2985+
LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
29842986
UnpinBuffer(bufHdr, true);
29852987
}
29862988
else
@@ -3004,7 +3006,7 @@ FlushOneBuffer(Buffer buffer)
30043006

30053007
bufHdr=GetBufferDescriptor(buffer-1);
30063008

3007-
Assert(LWLockHeldByMe(bufHdr->content_lock));
3009+
Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
30083010

30093011
FlushBuffer(bufHdr,NULL);
30103012
}
@@ -3101,7 +3103,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
31013103

31023104
Assert(GetPrivateRefCount(buffer)>0);
31033105
/* here, either share or exclusive lock is OK */
3104-
Assert(LWLockHeldByMe(bufHdr->content_lock));
3106+
Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
31053107

31063108
/*
31073109
* This routine might get called many times on the same page, if we are
@@ -3254,11 +3256,11 @@ LockBuffer(Buffer buffer, int mode)
32543256
buf=GetBufferDescriptor(buffer-1);
32553257

32563258
if (mode==BUFFER_LOCK_UNLOCK)
3257-
LWLockRelease(buf->content_lock);
3259+
LWLockRelease(BufferDescriptorGetContentLock(buf));
32583260
elseif (mode==BUFFER_LOCK_SHARE)
3259-
LWLockAcquire(buf->content_lock,LW_SHARED);
3261+
LWLockAcquire(BufferDescriptorGetContentLock(buf),LW_SHARED);
32603262
elseif (mode==BUFFER_LOCK_EXCLUSIVE)
3261-
LWLockAcquire(buf->content_lock,LW_EXCLUSIVE);
3263+
LWLockAcquire(BufferDescriptorGetContentLock(buf),LW_EXCLUSIVE);
32623264
else
32633265
elog(ERROR,"unrecognized buffer lock mode: %d",mode);
32643266
}
@@ -3279,7 +3281,8 @@ ConditionalLockBuffer(Buffer buffer)
32793281

32803282
buf=GetBufferDescriptor(buffer-1);
32813283

3282-
returnLWLockConditionalAcquire(buf->content_lock,LW_EXCLUSIVE);
3284+
returnLWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
3285+
LW_EXCLUSIVE);
32833286
}
32843287

32853288
/*
@@ -3489,8 +3492,8 @@ WaitIO(BufferDesc *buf)
34893492
UnlockBufHdr(buf);
34903493
if (!(sv_flags&BM_IO_IN_PROGRESS))
34913494
break;
3492-
LWLockAcquire(buf->io_in_progress_lock,LW_SHARED);
3493-
LWLockRelease(buf->io_in_progress_lock);
3495+
LWLockAcquire(BufferDescriptorGetIOLock(buf),LW_SHARED);
3496+
LWLockRelease(BufferDescriptorGetIOLock(buf));
34943497
}
34953498
}
34963499

@@ -3523,7 +3526,7 @@ StartBufferIO(BufferDesc *buf, bool forInput)
35233526
* Grab the io_in_progress lock so that other processes can wait for
35243527
* me to finish the I/O.
35253528
*/
3526-
LWLockAcquire(buf->io_in_progress_lock,LW_EXCLUSIVE);
3529+
LWLockAcquire(BufferDescriptorGetIOLock(buf),LW_EXCLUSIVE);
35273530

35283531
LockBufHdr(buf);
35293532

@@ -3537,7 +3540,7 @@ StartBufferIO(BufferDesc *buf, bool forInput)
35373540
* him to get unwedged.
35383541
*/
35393542
UnlockBufHdr(buf);
3540-
LWLockRelease(buf->io_in_progress_lock);
3543+
LWLockRelease(BufferDescriptorGetIOLock(buf));
35413544
WaitIO(buf);
35423545
}
35433546

@@ -3547,7 +3550,7 @@ StartBufferIO(BufferDesc *buf, bool forInput)
35473550
{
35483551
/* someone else already did the I/O */
35493552
UnlockBufHdr(buf);
3550-
LWLockRelease(buf->io_in_progress_lock);
3553+
LWLockRelease(BufferDescriptorGetIOLock(buf));
35513554
return false;
35523555
}
35533556

@@ -3595,7 +3598,7 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, int set_flag_bits)
35953598

35963599
InProgressBuf=NULL;
35973600

3598-
LWLockRelease(buf->io_in_progress_lock);
3601+
LWLockRelease(BufferDescriptorGetIOLock(buf));
35993602
}
36003603

36013604
/*
@@ -3620,7 +3623,7 @@ AbortBufferIO(void)
36203623
* we can use TerminateBufferIO. Anyone who's executing WaitIO on the
36213624
* buffer will be in a busy spin until we succeed in doing this.
36223625
*/
3623-
LWLockAcquire(buf->io_in_progress_lock,LW_EXCLUSIVE);
3626+
LWLockAcquire(BufferDescriptorGetIOLock(buf),LW_EXCLUSIVE);
36243627

36253628
LockBufHdr(buf);
36263629
Assert(buf->flags&BM_IO_IN_PROGRESS);

‎src/backend/storage/lmgr/lwlock.c

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -344,18 +344,15 @@ NumLWLocks(void)
344344
intnumLocks;
345345

346346
/*
347-
*Possibly this logic should be spread out amongtheaffected modules,
348-
*the same way that shmem space estimation is done.But for now, there
349-
*are few enough usersofLWLocks that we can get away with just keeping
350-
*the knowledge here.
347+
*Many users of LWLocks no longer reserve space inthemain array here,
348+
*but instead allocate separate tranches.The latter approach has the
349+
*advantageofallowing LWLOCK_STATS and LOCK_DEBUG output to produce
350+
*more useful output.
351351
*/
352352

353353
/* Predefined LWLocks */
354354
numLocks=NUM_FIXED_LWLOCKS;
355355

356-
/* bufmgr.c needs two for each shared buffer */
357-
numLocks+=2*NBuffers;
358-
359356
/* proc.c needs one for each backend or auxiliary process */
360357
numLocks+=MaxBackends+NUM_AUXILIARY_PROCS;
361358

@@ -423,6 +420,10 @@ CreateLWLocks(void)
423420
StaticAssertExpr(LW_VAL_EXCLUSIVE> (uint32)MAX_BACKENDS,
424421
"MAX_BACKENDS too big for lwlock.c");
425422

423+
StaticAssertExpr(sizeof(LWLock) <=LWLOCK_MINIMAL_SIZE&&
424+
sizeof(LWLock) <=LWLOCK_PADDED_SIZE,
425+
"Miscalculated LWLock padding");
426+
426427
if (!IsUnderPostmaster)
427428
{
428429
intnumLocks=NumLWLocks();

‎src/include/storage/buf_internals.h

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -115,8 +115,8 @@ typedef struct buftag
115115
* Note: buf_hdr_lock must be held to examine or change the tag, flags,
116116
* usage_count, refcount, or wait_backend_pid fields. buf_id field never
117117
* changes after initialization, so does not need locking. freeNext is
118-
* protected by the buffer_strategy_lock not buf_hdr_lock. TheLWLocks can take
119-
* care ofthemselves. The buf_hdr_lock is *not* used to control access to
118+
* protected by the buffer_strategy_lock not buf_hdr_lock. TheLWLock can
119+
*takecare ofitself. The buf_hdr_lock is *not* used to control access to
120120
* the data in the buffer!
121121
*
122122
* An exception is that if we have the buffer pinned, its tag can't change
@@ -133,22 +133,24 @@ typedef struct buftag
133133
*
134134
* We use this same struct for local buffer headers, but the lock fields
135135
* are not used and not all of the flag bits are useful either.
136+
*
137+
* Be careful to avoid increasing the size of the struct when adding or
138+
* reordering members. Keeping it below 64 bytes (the most common CPU
139+
* cache line size) is fairly important for performance.
136140
*/
137141
typedefstructBufferDesc
138142
{
139143
BufferTagtag;/* ID of page contained in buffer */
140144
BufFlagsflags;/* see bit definitions above */
141-
uint16usage_count;/* usage counter for clock sweep code */
145+
uint8usage_count;/* usage counter for clock sweep code */
146+
slock_tbuf_hdr_lock;/* protects a subset of fields, see above */
142147
unsignedrefcount;/* # of backends holding pins on buffer */
143148
intwait_backend_pid;/* backend PID of pin-count waiter */
144149

145-
slock_tbuf_hdr_lock;/* protects the above fields */
146-
147150
intbuf_id;/* buffer's index number (from 0) */
148151
intfreeNext;/* link in freelist chain */
149152

150-
LWLock*io_in_progress_lock;/* to wait for I/O to complete */
151-
LWLock*content_lock;/* to lock access to buffer contents */
153+
LWLockcontent_lock;/* to lock access to buffer contents */
152154
}BufferDesc;
153155

154156
/*
@@ -184,6 +186,13 @@ typedef union BufferDescPadded
184186

185187
#defineBufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)
186188

189+
#defineBufferDescriptorGetIOLock(bdesc) \
190+
(&(BufferIOLWLockArray[(bdesc)->buf_id]).lock)
191+
#defineBufferDescriptorGetContentLock(bdesc) \
192+
((LWLock*) (&(bdesc)->content_lock))
193+
194+
externPGDLLIMPORTLWLockMinimallyPadded*BufferIOLWLockArray;
195+
187196
/*
188197
* The freeNext field is either the index of the next freelist entry,
189198
* or one of these special values:

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp