Commit 73a3b1d

Backport fsync queue compaction logic to all supported branches.
This backports commit 7f242d8, except for the counter in pg_stat_bgwriter. The underlying problem (namely, that a full fsync request queue causes terrible checkpoint behavior) continues to be reported in the wild, and this code seems to be safe and robust enough to risk back-porting the fix.
1 parent d8bd584 commit 73a3b1d

2 files changed: +127 −10

src/backend/postmaster/bgwriter.c

Lines changed: 120 additions & 9 deletions
@@ -179,6 +179,7 @@ static void CheckArchiveTimeout(void);
 static void BgWriterNap(void);
 static bool IsCheckpointOnSchedule(double progress);
 static bool ImmediateCheckpointRequested(void);
+static bool CompactBgwriterRequestQueue(void);
 
 /* Signal handlers */
 
@@ -1061,14 +1062,15 @@ RequestCheckpoint(int flags)
  * use high values for special flags; that's all internal to md.c, which
  * see for details.)
  *
- * If we are unable to pass over the request (at present, this can happen
- * if the shared memory queue is full), we return false.  That forces
- * the backend to do its own fsync.  We hope that will be even more seldom.
- *
- * Note: we presently make no attempt to eliminate duplicate requests
- * in the requests[] queue.  The bgwriter will have to eliminate dups
- * internally anyway, so we may as well avoid holding the lock longer
- * than we have to here.
+ * To avoid holding the lock for longer than necessary, we normally write
+ * to the requests[] queue without checking for duplicates.  The bgwriter
+ * will have to eliminate dups internally anyway.  However, if we discover
+ * that the queue is full, we make a pass over the entire queue to compact
+ * it.  This is somewhat expensive, but the alternative is for the backend
+ * to perform its own fsync, which is far more expensive in practice.  It
+ * is theoretically possible a backend fsync might still be necessary, if
+ * the queue is full and contains no duplicate entries.  In that case, we
+ * let the backend know by returning false.
  */
 bool
 ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
@@ -1086,8 +1088,15 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
     /* we count non-bgwriter writes even when the request queue overflows */
     BgWriterShmem->num_backend_writes++;
 
+    /*
+     * If the background writer isn't running or the request queue is full,
+     * the backend will have to perform its own fsync request.  But before
+     * forcing that to happen, we can try to compact the background writer
+     * request queue.
+     */
     if (BgWriterShmem->bgwriter_pid == 0 ||
-        BgWriterShmem->num_requests >= BgWriterShmem->max_requests)
+        (BgWriterShmem->num_requests >= BgWriterShmem->max_requests
+         && !CompactBgwriterRequestQueue()))
     {
         LWLockRelease(BgWriterCommLock);
         return false;
@@ -1100,6 +1109,108 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
     return true;
 }
 
+/*
+ * CompactBgwriterRequestQueue
+ *        Remove duplicates from the request queue to avoid backend fsyncs.
+ *
+ * Although a full fsync request queue is not common, it can lead to severe
+ * performance problems when it does happen.  So far, this situation has
+ * only been observed to occur when the system is under heavy write load,
+ * and especially during the "sync" phase of a checkpoint.  Without this
+ * logic, each backend begins doing an fsync for every block written, which
+ * gets very expensive and can slow down the whole system.
+ *
+ * Trying to do this every time the queue is full could lose if there
+ * aren't any removable entries.  But should be vanishingly rare in
+ * practice: there's one queue entry per shared buffer.
+ */
+static bool
+CompactBgwriterRequestQueue()
+{
+    struct BgWriterSlotMapping {
+        BgWriterRequest request;
+        int             slot;
+    };
+
+    int         n,
+                preserve_count;
+    int         num_skipped = 0;
+    HASHCTL     ctl;
+    HTAB       *htab;
+    bool       *skip_slot;
+
+    /* must hold BgWriterCommLock in exclusive mode */
+    Assert(LWLockHeldByMe(BgWriterCommLock));
+
+    /* Initialize temporary hash table */
+    MemSet(&ctl, 0, sizeof(ctl));
+    ctl.keysize = sizeof(BgWriterRequest);
+    ctl.entrysize = sizeof(struct BgWriterSlotMapping);
+    ctl.hash = tag_hash;
+    htab = hash_create("CompactBgwriterRequestQueue",
+                       BgWriterShmem->num_requests,
+                       &ctl,
+                       HASH_ELEM | HASH_FUNCTION);
+
+    /* Initialize skip_slot array */
+    skip_slot = palloc0(sizeof(bool) * BgWriterShmem->num_requests);
+
+    /*
+     * The basic idea here is that a request can be skipped if it's followed
+     * by a later, identical request.  It might seem more sensible to work
+     * backwards from the end of the queue and check whether a request is
+     * *preceded* by an earlier, identical request, in the hopes of doing less
+     * copying.  But that might change the semantics, if there's an
+     * intervening FORGET_RELATION_FSYNC or FORGET_DATABASE_FSYNC request, so
+     * we do it this way.  It would be possible to be even smarter if we made
+     * the code below understand the specific semantics of such requests (it
+     * could blow away preceding entries that would end up being cancelled
+     * anyhow), but it's not clear that the extra complexity would buy us
+     * anything.
+     */
+    for (n = 0; n < BgWriterShmem->num_requests; ++n)
+    {
+        BgWriterRequest *request;
+        struct BgWriterSlotMapping *slotmap;
+        bool        found;
+
+        request = &BgWriterShmem->requests[n];
+        slotmap = hash_search(htab, request, HASH_ENTER, &found);
+        if (found)
+        {
+            skip_slot[slotmap->slot] = true;
+            ++num_skipped;
+        }
+        slotmap->slot = n;
+    }
+
+    /* Done with the hash table. */
+    hash_destroy(htab);
+
+    /* If no duplicates, we're out of luck. */
+    if (!num_skipped)
+    {
+        pfree(skip_slot);
+        return false;
+    }
+
+    /* We found some duplicates; remove them. */
+    for (n = 0, preserve_count = 0; n < BgWriterShmem->num_requests; ++n)
+    {
+        if (skip_slot[n])
+            continue;
+        BgWriterShmem->requests[preserve_count++] = BgWriterShmem->requests[n];
+    }
+    ereport(DEBUG1,
+            (errmsg("compacted fsync request queue from %d entries to %d entries",
+                    BgWriterShmem->num_requests, preserve_count)));
+    BgWriterShmem->num_requests = preserve_count;
+
+    /* Cleanup. */
+    pfree(skip_slot);
+    return true;
+}
+
 /*
  * AbsorbFsyncRequests
  *        Retrieve queued fsync requests and pass them to local smgr.

src/backend/storage/smgr/md.c

Lines changed: 7 additions & 1 deletion
@@ -33,7 +33,13 @@
 /* interval for calling AbsorbFsyncRequests in mdsync */
 #define FSYNCS_PER_ABSORB    10
 
-/* special values for the segno arg to RememberFsyncRequest */
+/*
+ * Special values for the segno arg to RememberFsyncRequest.
+ *
+ * Note that CompactBgwriterRequestQueue assumes that it's OK to remove an
+ * fsync request from the queue if an identical, subsequent request is found.
+ * See comments there before making changes here.
+ */
 #define FORGET_RELATION_FSYNC    (InvalidBlockNumber)
 #define FORGET_DATABASE_FSYNC    (InvalidBlockNumber-1)
 #define UNLINK_RELATION_REQUEST  (InvalidBlockNumber-2)
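
For readers skimming the change, the following is a minimal standalone sketch of the compaction rule the patch applies: a request may be dropped if an identical request appears later in the queue, so only the last occurrence of each request survives and the relative order of the survivors is preserved. Everything here (the FsyncRequest type, compact_queue, the fixed-size skip array) is a hypothetical stand-in, not PostgreSQL code; the actual CompactBgwriterRequestQueue above finds duplicates in a single pass over the shared-memory queue using a dynahash table keyed on BgWriterRequest.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for BgWriterRequest: (rel, fork, segno). */
typedef struct
{
    unsigned rel;
    unsigned fork;
    unsigned segno;
} FsyncRequest;

static bool
request_equal(const FsyncRequest *a, const FsyncRequest *b)
{
    return a->rel == b->rel && a->fork == b->fork && a->segno == b->segno;
}

/*
 * Compact the queue in place and return the new length.  An entry is
 * dropped when an identical entry appears later in the queue, so only the
 * last occurrence survives.  The real patch does this in one pass with a
 * hash table; the quadratic scan here is only for brevity.
 */
static int
compact_queue(FsyncRequest *queue, int len)
{
    bool skip[256];             /* sketch assumes len <= 256 */
    int n, m, keep = 0;

    memset(skip, 0, sizeof(skip));
    for (n = 0; n < len; ++n)
        for (m = n + 1; m < len; ++m)
            if (request_equal(&queue[n], &queue[m]))
            {
                skip[n] = true; /* a later identical request exists */
                break;
            }
    for (n = 0; n < len; ++n)
        if (!skip[n])
            queue[keep++] = queue[n];
    return keep;
}

int
main(void)
{
    FsyncRequest q[] = {
        {1, 0, 10}, {2, 0, 3}, {1, 0, 10}, {2, 0, 3}, {1, 0, 11}
    };
    int len = compact_queue(q, (int) (sizeof(q) / sizeof(q[0])));

    printf("compacted to %d entries\n", len);   /* prints 3 */
    return 0;
}

Keeping the last occurrence rather than the first is deliberate: as the comment in CompactBgwriterRequestQueue explains, a FORGET_RELATION_FSYNC or FORGET_DATABASE_FSYNC entry sitting between two duplicates must still take effect, which dropping the later duplicate could break.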
