Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 0dfe3d0

Browse files
committed
Make checkpoint requests more robust.
Commit 6f6a6d8 introduced a delay of up to 2 seconds if we're trying to request a checkpoint but the checkpointer hasn't started yet (or, much less likely, our kill() call fails). However buildfarm experience shows that that's not quite enough for slow or heavily-loaded machines. There's no good reason to assume that the checkpointer won't start eventually, so we may as well make the timeout much longer, say 60 sec. However, if the caller didn't say CHECKPOINT_WAIT, it seems like a bad idea to be waiting at all, much less for as long as 60 sec. We can remove the need for that, and make this whole thing more robust, by adjusting the code so that the existence of a pending checkpoint request is clear from the contents of shared memory, and making sure that the checkpointer process will notice it at startup even if it did not get a signal. In this way there's no need for a non-CHECKPOINT_WAIT call to wait at all; if it can't send the signal, it can nonetheless assume that the checkpointer will eventually service the request. A potential downside of this change is that "kill -INT" on the checkpointer process is no longer enough to trigger a checkpoint, should anyone be relying on something so hacky. But there's no obvious reason to do it like that rather than issuing a plain old CHECKPOINT command, so we'll assume that nobody is. There doesn't seem to be a way to preserve this undocumented quasi-feature without introducing race conditions. Since a principal reason for messing with this is to prevent intermittent buildfarm failures, back-patch to all supported branches. Discussion: https://postgr.es/m/27830.1552752475@sss.pgh.pa.us
1 parent 28988a8 commit 0dfe3d0

File tree

2 files changed

+36
-27
lines changed

2 files changed

+36
-27
lines changed

‎src/backend/postmaster/checkpointer.c

Lines changed: 33 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,6 @@ doubleCheckPointCompletionTarget = 0.5;
153153
* Flags set by interrupt handlers for later service in the main loop.
154154
*/
155155
staticvolatilesig_atomic_tgot_SIGHUP= false;
156-
staticvolatilesig_atomic_tcheckpoint_requested= false;
157156
staticvolatilesig_atomic_tshutdown_requested= false;
158157

159158
/*
@@ -370,12 +369,6 @@ CheckpointerMain(void)
370369
*/
371370
UpdateSharedMemoryConfig();
372371
}
373-
if (checkpoint_requested)
374-
{
375-
checkpoint_requested= false;
376-
do_checkpoint= true;
377-
BgWriterStats.m_requested_checkpoints++;
378-
}
379372
if (shutdown_requested)
380373
{
381374
/*
@@ -389,6 +382,17 @@ CheckpointerMain(void)
389382
proc_exit(0);/* done */
390383
}
391384

385+
/*
386+
* Detect a pending checkpoint request by checking whether the flags
387+
* word in shared memory is nonzero. We shouldn't need to acquire the
388+
* ckpt_lck for this.
389+
*/
390+
if (((volatileCheckpointerShmemStruct*)CheckpointerShmem)->ckpt_flags)
391+
{
392+
do_checkpoint= true;
393+
BgWriterStats.m_requested_checkpoints++;
394+
}
395+
392396
/*
393397
* Force a checkpoint if too much time has elapsed since the last one.
394398
* Note that we count a timed checkpoint in stats only when this
@@ -630,17 +634,14 @@ CheckArchiveTimeout(void)
630634
staticbool
631635
ImmediateCheckpointRequested(void)
632636
{
633-
if (checkpoint_requested)
634-
{
635-
volatileCheckpointerShmemStruct*cps=CheckpointerShmem;
637+
volatileCheckpointerShmemStruct*cps=CheckpointerShmem;
636638

637-
/*
638-
* We don't need to acquire the ckpt_lck in this case because we're
639-
* only looking at a single flag bit.
640-
*/
641-
if (cps->ckpt_flags&CHECKPOINT_IMMEDIATE)
642-
return true;
643-
}
639+
/*
640+
* We don't need to acquire the ckpt_lck in this case because we're only
641+
* looking at a single flag bit.
642+
*/
643+
if (cps->ckpt_flags&CHECKPOINT_IMMEDIATE)
644+
return true;
644645
return false;
645646
}
646647

@@ -843,7 +844,10 @@ ReqCheckpointHandler(SIGNAL_ARGS)
843844
{
844845
intsave_errno=errno;
845846

846-
checkpoint_requested= true;
847+
/*
848+
* The signalling process should have set ckpt_flags nonzero, so all we
849+
* need do is ensure that our main loop gets kicked out of any wait.
850+
*/
847851
SetLatch(MyLatch);
848852

849853
errno=save_errno;
@@ -984,31 +988,35 @@ RequestCheckpoint(int flags)
984988

985989
old_failed=CheckpointerShmem->ckpt_failed;
986990
old_started=CheckpointerShmem->ckpt_started;
987-
CheckpointerShmem->ckpt_flags |=flags;
991+
CheckpointerShmem->ckpt_flags |=(flags |CHECKPOINT_REQUESTED);
988992

989993
SpinLockRelease(&CheckpointerShmem->ckpt_lck);
990994

991995
/*
992996
* Send signal to request checkpoint. It's possible that the checkpointer
993997
* hasn't started yet, or is in process of restarting, so we will retry a
994-
* few times if needed. Also, if not told to wait for the checkpoint to
995-
* occur, we consider failure to send the signal to be nonfatal and merely
996-
* LOG it.
998+
* few times if needed. (Actually, more than a few times, since on slow
999+
* or overloaded buildfarm machines, it's been observed that the
1000+
* checkpointer can take several seconds to start.) However, if not told
1001+
* to wait for the checkpoint to occur, we consider failure to send the
1002+
* signal to be nonfatal and merely LOG it. The checkpointer should see
1003+
* the request when it does start, with or without getting a signal.
9971004
*/
1005+
#defineMAX_SIGNAL_TRIES 600/* max wait 60.0 sec */
9981006
for (ntries=0;;ntries++)
9991007
{
10001008
if (CheckpointerShmem->checkpointer_pid==0)
10011009
{
1002-
if (ntries >=20)/* max wait 2.0 sec */
1010+
if (ntries >=MAX_SIGNAL_TRIES|| !(flags&CHECKPOINT_WAIT))
10031011
{
10041012
elog((flags&CHECKPOINT_WAIT) ?ERROR :LOG,
1005-
"could notrequestcheckpoint becausecheckpointer not running");
1013+
"could notsignal forcheckpoint:checkpointer is not running");
10061014
break;
10071015
}
10081016
}
10091017
elseif (kill(CheckpointerShmem->checkpointer_pid,SIGINT)!=0)
10101018
{
1011-
if (ntries >=20)/* max wait 2.0 sec */
1019+
if (ntries >=MAX_SIGNAL_TRIES|| !(flags&CHECKPOINT_WAIT))
10121020
{
10131021
elog((flags&CHECKPOINT_WAIT) ?ERROR :LOG,
10141022
"could not signal for checkpoint: %m");

‎src/include/access/xlog.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -215,9 +215,10 @@ extern bool XLOG_DEBUG;
215215
* belonging to unlogged tables */
216216
/* These are important to RequestCheckpoint */
217217
#defineCHECKPOINT_WAIT0x0020/* Wait for completion */
218+
#defineCHECKPOINT_REQUESTED0x0040/* Checkpoint request has been made */
218219
/* These indicate the cause of a checkpoint request */
219-
#defineCHECKPOINT_CAUSE_XLOG0x0040/* XLOG consumption */
220-
#defineCHECKPOINT_CAUSE_TIME0x0080/* Elapsed time */
220+
#defineCHECKPOINT_CAUSE_XLOG0x0080/* XLOG consumption */
221+
#defineCHECKPOINT_CAUSE_TIME0x0100/* Elapsed time */
221222

222223
/*
223224
* Flag bits for the record being inserted, set using XLogSetRecordFlags().

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp