Commit 73a3b1d

Backport fsync queue compaction logic to all supported branches.
This backports commit 7f242d8, except for the counter in pg_stat_bgwriter. The underlying problem (namely, that a full fsync request queue causes terrible checkpoint behavior) continues to be reported in the wild, and this code seems to be safe and robust enough to risk back-porting the fix.
1 parent d8bd584 commit 73a3b1d

2 files changed: +127 −10

src/backend/postmaster/bgwriter.c

Lines changed: 120 additions & 9 deletions
@@ -179,6 +179,7 @@ static void CheckArchiveTimeout(void);
 static void BgWriterNap(void);
 static bool IsCheckpointOnSchedule(double progress);
 static bool ImmediateCheckpointRequested(void);
+static bool CompactBgwriterRequestQueue(void);
 
 /* Signal handlers */
 
@@ -1061,14 +1062,15 @@ RequestCheckpoint(int flags)
  * use high values for special flags; that's all internal to md.c, which
  * see for details.)
  *
- * If we are unable to pass over the request (at present, this can happen
- * if the shared memory queue is full), we return false.  That forces
- * the backend to do its own fsync.  We hope that will be even more seldom.
- *
- * Note: we presently make no attempt to eliminate duplicate requests
- * in the requests[] queue.  The bgwriter will have to eliminate dups
- * internally anyway, so we may as well avoid holding the lock longer
- * than we have to here.
+ * To avoid holding the lock for longer than necessary, we normally write
+ * to the requests[] queue without checking for duplicates.  The bgwriter
+ * will have to eliminate dups internally anyway.  However, if we discover
+ * that the queue is full, we make a pass over the entire queue to compact
+ * it.  This is somewhat expensive, but the alternative is for the backend
+ * to perform its own fsync, which is far more expensive in practice.  It
+ * is theoretically possible a backend fsync might still be necessary, if
+ * the queue is full and contains no duplicate entries.  In that case, we
+ * let the backend know by returning false.
  */
 bool
 ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
@@ -1086,8 +1088,15 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
     /* we count non-bgwriter writes even when the request queue overflows */
     BgWriterShmem->num_backend_writes++;
 
+    /*
+     * If the background writer isn't running or the request queue is full,
+     * the backend will have to perform its own fsync request.  But before
+     * forcing that to happen, we can try to compact the background writer
+     * request queue.
+     */
     if (BgWriterShmem->bgwriter_pid == 0 ||
-        BgWriterShmem->num_requests >= BgWriterShmem->max_requests)
+        (BgWriterShmem->num_requests >= BgWriterShmem->max_requests
+         && !CompactBgwriterRequestQueue()))
     {
         LWLockRelease(BgWriterCommLock);
         return false;
@@ -1100,6 +1109,108 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
     return true;
 }
 
+/*
+ * CompactBgwriterRequestQueue
+ *        Remove duplicates from the request queue to avoid backend fsyncs.
+ *
+ * Although a full fsync request queue is not common, it can lead to severe
+ * performance problems when it does happen.  So far, this situation has
+ * only been observed to occur when the system is under heavy write load,
+ * and especially during the "sync" phase of a checkpoint.  Without this
+ * logic, each backend begins doing an fsync for every block written, which
+ * gets very expensive and can slow down the whole system.
+ *
+ * Trying to do this every time the queue is full could lose if there
+ * aren't any removable entries.  But should be vanishingly rare in
+ * practice: there's one queue entry per shared buffer.
+ */
+static bool
+CompactBgwriterRequestQueue()
+{
+    struct BgWriterSlotMapping {
+        BgWriterRequest request;
+        int             slot;
+    };
+
+    int         n,
+                preserve_count;
+    int         num_skipped = 0;
+    HASHCTL     ctl;
+    HTAB       *htab;
+    bool       *skip_slot;
+
+    /* must hold BgWriterCommLock in exclusive mode */
+    Assert(LWLockHeldByMe(BgWriterCommLock));
+
+    /* Initialize temporary hash table */
+    MemSet(&ctl, 0, sizeof(ctl));
+    ctl.keysize = sizeof(BgWriterRequest);
+    ctl.entrysize = sizeof(struct BgWriterSlotMapping);
+    ctl.hash = tag_hash;
+    htab = hash_create("CompactBgwriterRequestQueue",
+                       BgWriterShmem->num_requests,
+                       &ctl,
+                       HASH_ELEM | HASH_FUNCTION);
+
+    /* Initialize skip_slot array */
+    skip_slot = palloc0(sizeof(bool) * BgWriterShmem->num_requests);
+
+    /*
+     * The basic idea here is that a request can be skipped if it's followed
+     * by a later, identical request.  It might seem more sensible to work
+     * backwards from the end of the queue and check whether a request is
+     * *preceded* by an earlier, identical request, in the hopes of doing less
+     * copying.  But that might change the semantics, if there's an
+     * intervening FORGET_RELATION_FSYNC or FORGET_DATABASE_FSYNC request, so
+     * we do it this way.  It would be possible to be even smarter if we made
+     * the code below understand the specific semantics of such requests (it
+     * could blow away preceding entries that would end up being cancelled
+     * anyhow), but it's not clear that the extra complexity would buy us
+     * anything.
+     */
+    for (n = 0; n < BgWriterShmem->num_requests; ++n)
+    {
+        BgWriterRequest *request;
+        struct BgWriterSlotMapping *slotmap;
+        bool        found;
+
+        request = &BgWriterShmem->requests[n];
+        slotmap = hash_search(htab, request, HASH_ENTER, &found);
+        if (found)
+        {
+            skip_slot[slotmap->slot] = true;
+            ++num_skipped;
+        }
+        slotmap->slot = n;
+    }
+
+    /* Done with the hash table. */
+    hash_destroy(htab);
+
+    /* If no duplicates, we're out of luck. */
+    if (!num_skipped)
+    {
+        pfree(skip_slot);
+        return false;
+    }
+
+    /* We found some duplicates; remove them. */
+    for (n = 0, preserve_count = 0; n < BgWriterShmem->num_requests; ++n)
+    {
+        if (skip_slot[n])
+            continue;
+        BgWriterShmem->requests[preserve_count++] = BgWriterShmem->requests[n];
+    }
+    ereport(DEBUG1,
+            (errmsg("compacted fsync request queue from %d entries to %d entries",
+                    BgWriterShmem->num_requests, preserve_count)));
+    BgWriterShmem->num_requests = preserve_count;
+
+    /* Cleanup. */
+    pfree(skip_slot);
+    return true;
+}
+
 /*
  * AbsorbFsyncRequests
  *        Retrieve queued fsync requests and pass them to local smgr.

src/backend/storage/smgr/md.c

Lines changed: 7 additions & 1 deletion
@@ -33,7 +33,13 @@
 /* interval for calling AbsorbFsyncRequests in mdsync */
 #define FSYNCS_PER_ABSORB    10
 
-/* special values for the segno arg to RememberFsyncRequest */
+/*
+ * Special values for the segno arg to RememberFsyncRequest.
+ *
+ * Note that CompactBgwriterRequestQueue assumes that it's OK to remove an
+ * fsync request from the queue if an identical, subsequent request is found.
+ * See comments there before making changes here.
+ */
 #define FORGET_RELATION_FSYNC    (InvalidBlockNumber)
 #define FORGET_DATABASE_FSYNC    (InvalidBlockNumber-1)
 #define UNLINK_RELATION_REQUEST  (InvalidBlockNumber-2)
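
For readers skimming the change, the following is a minimal standalone sketch of the compaction rule the patch applies: a request may be dropped if an identical request appears later in the queue, so only the last occurrence of each request survives and the relative order of the survivors is preserved. Everything here (the FsyncRequest type, compact_queue, the fixed-size skip array) is a hypothetical stand-in, not PostgreSQL code; the actual CompactBgwriterRequestQueue above finds duplicates in a single pass over the shared-memory queue using a dynahash table keyed on BgWriterRequest.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for BgWriterRequest: (rel, fork, segno). */
typedef struct
{
    unsigned rel;
    unsigned fork;
    unsigned segno;
} FsyncRequest;

static bool
request_equal(const FsyncRequest *a, const FsyncRequest *b)
{
    return a->rel == b->rel && a->fork == b->fork && a->segno == b->segno;
}

/*
 * Compact the queue in place and return the new length.  An entry is
 * dropped when an identical entry appears later in the queue, so only the
 * last occurrence survives.  The real patch does this in one pass with a
 * hash table; the quadratic scan here is only for brevity.
 */
static int
compact_queue(FsyncRequest *queue, int len)
{
    bool skip[256];             /* sketch assumes len <= 256 */
    int n, m, keep = 0;

    memset(skip, 0, sizeof(skip));
    for (n = 0; n < len; ++n)
        for (m = n + 1; m < len; ++m)
            if (request_equal(&queue[n], &queue[m]))
            {
                skip[n] = true; /* a later identical request exists */
                break;
            }
    for (n = 0; n < len; ++n)
        if (!skip[n])
            queue[keep++] = queue[n];
    return keep;
}

int
main(void)
{
    FsyncRequest q[] = {
        {1, 0, 10}, {2, 0, 3}, {1, 0, 10}, {2, 0, 3}, {1, 0, 11}
    };
    int len = compact_queue(q, (int) (sizeof(q) / sizeof(q[0])));

    printf("compacted to %d entries\n", len);   /* prints 3 */
    return 0;
}

Keeping the last occurrence rather than the first is deliberate: as the comment in CompactBgwriterRequestQueue explains, a FORGET_RELATION_FSYNC or FORGET_DATABASE_FSYNC entry sitting between two duplicates must still take effect, which dropping the later duplicate could break.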
