NotificationsYou must be signed in to change notification settings
Fork6
Star31

Commit 0dfe3d0

committed

Make checkpoint requests more robust.

Commit 6f6a6d8 introduced a delay of up to 2 seconds if we're trying to request a checkpoint but the checkpointer hasn't started yet (or, much less likely, our kill() call fails). However buildfarm experience shows that that's not quite enough for slow or heavily-loaded machines. There's no good reason to assume that the checkpointer won't start eventually, so we may as well make the timeout much longer, say 60 sec.

However, if the caller didn't say CHECKPOINT_WAIT, it seems like a bad idea to be waiting at all, much less for as long as 60 sec. We can remove the need for that, and make this whole thing more robust, by adjusting the code so that the existence of a pending checkpoint request is clear from the contents of shared memory, and making sure that the checkpointer process will notice it at startup even if it did not get a signal. In this way there's no need for a non-CHECKPOINT_WAIT call to wait at all; if it can't send the signal, it can nonetheless assume that the checkpointer will eventually service the request.

A potential downside of this change is that "kill -INT" on the checkpointer process is no longer enough to trigger a checkpoint, should anyone be relying on something so hacky. But there's no obvious reason to do it like that rather than issuing a plain old CHECKPOINT command, so we'll assume that nobody is. There doesn't seem to be a way to preserve this undocumented quasi-feature without introducing race conditions.

Since a principal reason for messing with this is to prevent intermittent buildfarm failures, back-patch to all supported branches.

Discussion: https://postgr.es/m/27830.1552752475@sss.pgh.pa.us

1 parent 28988a8, commit 0dfe3d0 (Copy full SHA for 0dfe3d0)

File tree

2 files changed

+36

-27

lines changed

src
- backend/postmaster
  - checkpointer.c
- include/access
  - xlog.h

2 files changed

+36

-27

lines changed

`‎src/backend/postmaster/checkpointer.c‎`

Lines changed: 33 additions & 25 deletions

Original file line number	Diff line number	Diff line change
`@@ -153,7 +153,6 @@ doubleCheckPointCompletionTarget = 0.5;`
`153`	`153`	`* Flags set by interrupt handlers for later service in the main loop.`
`154`	`154`	`*/`
`155`	`155`	`staticvolatilesig_atomic_tgot_SIGHUP= false;`
`156`		`-staticvolatilesig_atomic_tcheckpoint_requested= false;`
`157`	`156`	`staticvolatilesig_atomic_tshutdown_requested= false;`
`158`	`157`
`159`	`158`	`/*`
`@@ -370,12 +369,6 @@ CheckpointerMain(void)`
`370`	`369`	`*/`
`371`	`370`	`UpdateSharedMemoryConfig();`
`372`	`371`	`}`
`373`		`-if (checkpoint_requested)`
`374`		`-{`
`375`		`-checkpoint_requested= false;`
`376`		`-do_checkpoint= true;`
`377`		`-BgWriterStats.m_requested_checkpoints++;`
`378`		`-}`
`379`	`372`	`if (shutdown_requested)`
`380`	`373`	`{`
`381`	`374`	`/*`
`@@ -389,6 +382,17 @@ CheckpointerMain(void)`
`389`	`382`	`proc_exit(0);/* done */`
`390`	`383`	`}`
`391`	`384`
	`385`	`+/*`
	`386`	`+ * Detect a pending checkpoint request by checking whether the flags`
	`387`	`+ * word in shared memory is nonzero. We shouldn't need to acquire the`
	`388`	`+ * ckpt_lck for this.`
	`389`	`+ */`
	`390`	`+if (((volatileCheckpointerShmemStruct*)CheckpointerShmem)->ckpt_flags)`
	`391`	`+{`
	`392`	`+do_checkpoint= true;`
	`393`	`+BgWriterStats.m_requested_checkpoints++;`
	`394`	`+}`
	`395`	`+`
`392`	`396`	`/*`
`393`	`397`	`* Force a checkpoint if too much time has elapsed since the last one.`
`394`	`398`	`* Note that we count a timed checkpoint in stats only when this`
`@@ -630,17 +634,14 @@ CheckArchiveTimeout(void)`
`630`	`634`	`staticbool`
`631`	`635`	`ImmediateCheckpointRequested(void)`
`632`	`636`	`{`
`633`		`-if (checkpoint_requested)`
`634`		`-{`
`635`		`-volatileCheckpointerShmemStruct*cps=CheckpointerShmem;`
	`637`	`+volatileCheckpointerShmemStruct*cps=CheckpointerShmem;`
`636`	`638`
`637`		`-/*`
`638`		`- * We don't need to acquire the ckpt_lck in this case because we're`
`639`		`- * only looking at a single flag bit.`
`640`		`- */`
`641`		`-if (cps->ckpt_flags&CHECKPOINT_IMMEDIATE)`
`642`		`-return true;`
`643`		`-}`
	`639`	`+/*`
	`640`	`+ * We don't need to acquire the ckpt_lck in this case because we're only`
	`641`	`+ * looking at a single flag bit.`
	`642`	`+ */`
	`643`	`+if (cps->ckpt_flags&CHECKPOINT_IMMEDIATE)`
	`644`	`+return true;`
`644`	`645`	`return false;`
`645`	`646`	`}`
`646`	`647`
`@@ -843,7 +844,10 @@ ReqCheckpointHandler(SIGNAL_ARGS)`
`843`	`844`	`{`
`844`	`845`	`intsave_errno=errno;`
`845`	`846`
`846`		`-checkpoint_requested= true;`
	`847`	`+/*`
	`848`	`+ * The signalling process should have set ckpt_flags nonzero, so all we`
	`849`	`+ * need do is ensure that our main loop gets kicked out of any wait.`
	`850`	`+ */`
`847`	`851`	`SetLatch(MyLatch);`
`848`	`852`
`849`	`853`	`errno=save_errno;`
`@@ -984,31 +988,35 @@ RequestCheckpoint(int flags)`
`984`	`988`
`985`	`989`	`old_failed=CheckpointerShmem->ckpt_failed;`
`986`	`990`	`old_started=CheckpointerShmem->ckpt_started;`
`987`		`-CheckpointerShmem->ckpt_flags \|=flags;`
	`991`	`+CheckpointerShmem->ckpt_flags \|=(flags \|CHECKPOINT_REQUESTED);`
`988`	`992`
`989`	`993`	`SpinLockRelease(&CheckpointerShmem->ckpt_lck);`
`990`	`994`
`991`	`995`	`/*`
`992`	`996`	`* Send signal to request checkpoint. It's possible that the checkpointer`
`993`	`997`	`* hasn't started yet, or is in process of restarting, so we will retry a`
`994`		`- * few times if needed. Also, if not told to wait for the checkpoint to`
`995`		`- * occur, we consider failure to send the signal to be nonfatal and merely`
`996`		`- * LOG it.`
	`998`	`+ * few times if needed. (Actually, more than a few times, since on slow`
	`999`	`+ * or overloaded buildfarm machines, it's been observed that the`
	`1000`	`+ * checkpointer can take several seconds to start.) However, if not told`
	`1001`	`+ * to wait for the checkpoint to occur, we consider failure to send the`
	`1002`	`+ * signal to be nonfatal and merely LOG it. The checkpointer should see`
	`1003`	`+ * the request when it does start, with or without getting a signal.`
`997`	`1004`	`*/`
	`1005`	`+#defineMAX_SIGNAL_TRIES 600/* max wait 60.0 sec */`
`998`	`1006`	`for (ntries=0;;ntries++)`
`999`	`1007`	`{`
`1000`	`1008`	`if (CheckpointerShmem->checkpointer_pid==0)`
`1001`	`1009`	`{`
`1002`		`-if (ntries >=20)/* max wait 2.0 sec */`
	`1010`	`+if (ntries >=MAX_SIGNAL_TRIES\|\| !(flags&CHECKPOINT_WAIT))`
`1003`	`1011`	`{`
`1004`	`1012`	`elog((flags&CHECKPOINT_WAIT) ?ERROR :LOG,`
`1005`		`-"could notrequestcheckpoint becausecheckpointer not running");`
	`1013`	`+"could notsignal forcheckpoint:checkpointer is not running");`
`1006`	`1014`	`break;`
`1007`	`1015`	`}`
`1008`	`1016`	`}`
`1009`	`1017`	`elseif (kill(CheckpointerShmem->checkpointer_pid,SIGINT)!=0)`
`1010`	`1018`	`{`
`1011`		`-if (ntries >=20)/* max wait 2.0 sec */`
	`1019`	`+if (ntries >=MAX_SIGNAL_TRIES\|\| !(flags&CHECKPOINT_WAIT))`
`1012`	`1020`	`{`
`1013`	`1021`	`elog((flags&CHECKPOINT_WAIT) ?ERROR :LOG,`
`1014`	`1022`	`"could not signal for checkpoint: %m");`

`‎src/include/access/xlog.h‎`

Lines changed: 3 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -215,9 +215,10 @@ extern bool XLOG_DEBUG;`
`215`	`215`	`* belonging to unlogged tables */`
`216`	`216`	`/* These are important to RequestCheckpoint */`
`217`	`217`	`#defineCHECKPOINT_WAIT0x0020/* Wait for completion */`
	`218`	`+#defineCHECKPOINT_REQUESTED0x0040/* Checkpoint request has been made */`
`218`	`219`	`/* These indicate the cause of a checkpoint request */`
`219`		`-#defineCHECKPOINT_CAUSE_XLOG0x0040/* XLOG consumption */`
`220`		`-#defineCHECKPOINT_CAUSE_TIME0x0080/* Elapsed time */`
	`220`	`+#defineCHECKPOINT_CAUSE_XLOG0x0080/* XLOG consumption */`
	`221`	`+#defineCHECKPOINT_CAUSE_TIME0x0100/* Elapsed time */`
`221`	`222`
`222`	`223`	`/*`
`223`	`224`	`* Flag bits for the record being inserted, set using XLogSetRecordFlags().`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 0dfe3d0

File tree

2 files changed

2 files changed

`‎src/backend/postmaster/checkpointer.c‎`

`‎src/include/access/xlog.h‎`

0 commit comments