Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit45811be

Browse files
committed
Fix postmaster's handling of a startup-process crash.
Ordinarily, a failure (unexpected exit status) of the startup subprocessshould be considered fatal, so the postmaster should just close up shopand quit. However, if we sent the startup process a SIGQUIT or SIGKILLsignal, the failure is hardly "unexpected", and we should attempt restart;this is necessary for recovery from ordinary backend crashes in hot-standbyscenarios. I attempted to implement the latter rule with a two-line patchin commit442231d, but it now emerges thatthat patch was a few bricks shy of a load: it failed to distinguish thecase of a signaled startup process from the case where the new startupprocess crashes before reaching database consistency. That resulted ininfinitely respawning a new startup process only to have it crash again.To handle this properly, we really must track whether we have sent the*current* startup process a kill signal. Rather than add yet anotherad-hoc boolean to the postmaster's state, I chose to unify this with theexisting RecoveryError flag into an enum tracking the startup process'sstate. That seems more consistent with the postmaster's general statemachine design.Back-patch to 9.0, like the previous patch.
1 parent6ba365a commit45811be

File tree

1 file changed

+40
-16
lines changed

1 file changed

+40
-16
lines changed

‎src/backend/postmaster/postmaster.c

Lines changed: 40 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,17 @@ static pid_t StartupPID = 0,
249249
PgStatPID=0,
250250
SysLoggerPID=0;
251251

252+
/* Startup process's status */
253+
typedefenum
254+
{
255+
STARTUP_NOT_RUNNING,
256+
STARTUP_RUNNING,
257+
STARTUP_SIGNALED,/* we sent it a SIGQUIT or SIGKILL */
258+
STARTUP_CRASHED
259+
}StartupStatusEnum;
260+
261+
staticStartupStatusEnumStartupStatus=STARTUP_NOT_RUNNING;
262+
252263
/* Startup/shutdown state */
253264
#defineNoShutdown0
254265
#defineSmartShutdown1
@@ -258,7 +269,6 @@ static pid_t StartupPID = 0,
258269
staticintShutdown=NoShutdown;
259270

260271
staticboolFatalError= false;/* T if recovering from backend crash */
261-
staticboolRecoveryError= false;/* T if WAL recovery failed */
262272

263273
/*
264274
* We use a simple state machine to control startup, shutdown, and
@@ -301,8 +311,6 @@ static bool RecoveryError = false;/* T if WAL recovery failed */
301311
* states, nor in PM_SHUTDOWN states (because we don't enter those states
302312
* when trying to recover from a crash). It can be true in PM_STARTUP state,
303313
* because we don't clear it until we've successfully started WAL redo.
304-
* Similarly, RecoveryError means that we have crashed during recovery, and
305-
* should not try to restart.
306314
*/
307315
typedefenum
308316
{
@@ -1246,6 +1254,7 @@ PostmasterMain(int argc, char *argv[])
12461254
*/
12471255
StartupPID=StartupDataBase();
12481256
Assert(StartupPID!=0);
1257+
StartupStatus=STARTUP_RUNNING;
12491258
pmState=PM_STARTUP;
12501259

12511260
/* Some workers may be scheduled to start now */
@@ -1666,7 +1675,7 @@ ServerLoop(void)
16661675

16671676
/* If we have lost the archiver, try to start a new one. */
16681677
if (PgArchPID==0&&PgArchStartupAllowed())
1669-
PgArchPID=pgarch_start();
1678+
PgArchPID=pgarch_start();
16701679

16711680
/* If we need to signal the autovacuum launcher, do so now */
16721681
if (avlauncher_needs_signal)
@@ -2591,6 +2600,7 @@ reaper(SIGNAL_ARGS)
25912600
if (Shutdown>NoShutdown&&
25922601
(EXIT_STATUS_0(exitstatus)||EXIT_STATUS_1(exitstatus)))
25932602
{
2603+
StartupStatus=STARTUP_NOT_RUNNING;
25942604
pmState=PM_WAIT_BACKENDS;
25952605
/* PostmasterStateMachine logic does the rest */
25962606
continue;
@@ -2600,6 +2610,7 @@ reaper(SIGNAL_ARGS)
26002610
{
26012611
ereport(LOG,
26022612
(errmsg("shutdown at recovery target")));
2613+
StartupStatus=STARTUP_NOT_RUNNING;
26032614
Shutdown=SmartShutdown;
26042615
TerminateChildren(SIGTERM);
26052616
pmState=PM_WAIT_BACKENDS;
@@ -2624,16 +2635,18 @@ reaper(SIGNAL_ARGS)
26242635
/*
26252636
* After PM_STARTUP, any unexpected exit (including FATAL exit) of
26262637
* the startup process is catastrophic, so kill other children,
2627-
* and setRecoveryError so we don't try to reinitialize after
2628-
* they're gone. Exception: ifFatalError isalready set, that
2629-
*implies we previously sent the startup process a SIGQUIT, so
2638+
* and setStartupStatus so we don't try to reinitialize after
2639+
* they're gone. Exception: ifStartupStatus isSTARTUP_SIGNALED,
2640+
*then we previously sent the startup process a SIGQUIT; so
26302641
* that's probably the reason it died, and we do want to try to
26312642
* restart in that case.
26322643
*/
26332644
if (!EXIT_STATUS_0(exitstatus))
26342645
{
2635-
if (!FatalError)
2636-
RecoveryError= true;
2646+
if (StartupStatus==STARTUP_SIGNALED)
2647+
StartupStatus=STARTUP_NOT_RUNNING;
2648+
else
2649+
StartupStatus=STARTUP_CRASHED;
26372650
HandleChildCrash(pid,exitstatus,
26382651
_("startup process"));
26392652
continue;
@@ -2642,6 +2655,7 @@ reaper(SIGNAL_ARGS)
26422655
/*
26432656
* Startup succeeded, commence normal operations
26442657
*/
2658+
StartupStatus=STARTUP_NOT_RUNNING;
26452659
FatalError= false;
26462660
Assert(AbortStartTime==0);
26472661
ReachedNormalRunning= true;
@@ -2962,7 +2976,7 @@ CleanupBackgroundWorker(int pid,
29622976
ReportBackgroundWorkerPID(rw);/* report child death */
29632977

29642978
LogChildExit(EXIT_STATUS_0(exitstatus) ?DEBUG1 :LOG,
2965-
namebuf,pid,exitstatus);
2979+
namebuf,pid,exitstatus);
29662980

29672981
return true;
29682982
}
@@ -3190,14 +3204,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
31903204

31913205
/* Take care of the startup process too */
31923206
if (pid==StartupPID)
3207+
{
31933208
StartupPID=0;
3209+
StartupStatus=STARTUP_CRASHED;
3210+
}
31943211
elseif (StartupPID!=0&&take_action)
31953212
{
31963213
ereport(DEBUG2,
31973214
(errmsg_internal("sending %s to process %d",
31983215
(SendStop ?"SIGSTOP" :"SIGQUIT"),
31993216
(int)StartupPID)));
32003217
signal_child(StartupPID, (SendStop ?SIGSTOP :SIGQUIT));
3218+
StartupStatus=STARTUP_SIGNALED;
32013219
}
32023220

32033221
/* Take care of the bgwriter too */
@@ -3589,13 +3607,14 @@ PostmasterStateMachine(void)
35893607
}
35903608

35913609
/*
3592-
* Ifrecoveryfailed, or the user does not want an automatic restart
3593-
* after backend crashes, wait for all non-syslogger children to exit, and
3594-
* then exit postmaster. We don't try to reinitialize whenrecovery fails,
3595-
* because more than likely it will just fail again and we will keep
3596-
* trying forever.
3610+
* Ifthe startup processfailed, or the user does not want an automatic
3611+
*restartafter backend crashes, wait for all non-syslogger children to
3612+
*exit, andthen exit postmaster.We don't try to reinitialize whenthe
3613+
*startup process fails,because more than likely it will just fail again
3614+
*and we will keeptrying forever.
35973615
*/
3598-
if (pmState==PM_NO_CHILDREN&& (RecoveryError|| !restart_after_crash))
3616+
if (pmState==PM_NO_CHILDREN&&
3617+
(StartupStatus==STARTUP_CRASHED|| !restart_after_crash))
35993618
ExitPostmaster(1);
36003619

36013620
/*
@@ -3615,6 +3634,7 @@ PostmasterStateMachine(void)
36153634

36163635
StartupPID=StartupDataBase();
36173636
Assert(StartupPID!=0);
3637+
StartupStatus=STARTUP_RUNNING;
36183638
pmState=PM_STARTUP;
36193639
/* crash recovery started, reset SIGKILL flag */
36203640
AbortStartTime=0;
@@ -3746,7 +3766,11 @@ TerminateChildren(int signal)
37463766
{
37473767
SignalChildren(signal);
37483768
if (StartupPID!=0)
3769+
{
37493770
signal_child(StartupPID,signal);
3771+
if (signal==SIGQUIT||signal==SIGKILL)
3772+
StartupStatus=STARTUP_SIGNALED;
3773+
}
37503774
if (BgWriterPID!=0)
37513775
signal_child(BgWriterPID,signal);
37523776
if (CheckpointerPID!=0)

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp