Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 9c39d7a

Browse files
committed
Fix postmaster's handling of a startup-process crash.
Ordinarily, a failure (unexpected exit status) of the startup subprocess should be considered fatal, so the postmaster should just close up shop and quit. However, if we sent the startup process a SIGQUIT or SIGKILL signal, the failure is hardly "unexpected", and we should attempt restart; this is necessary for recovery from ordinary backend crashes in hot-standby scenarios. I attempted to implement the latter rule with a two-line patch in commit 442231d, but it now emerges that that patch was a few bricks shy of a load: it failed to distinguish the case of a signaled startup process from the case where the new startup process crashes before reaching database consistency. That resulted in infinitely respawning a new startup process only to have it crash again.

To handle this properly, we really must track whether we have sent the *current* startup process a kill signal. Rather than add yet another ad-hoc boolean to the postmaster's state, I chose to unify this with the existing RecoveryError flag into an enum tracking the startup process's state. That seems more consistent with the postmaster's general state machine design.

Back-patch to 9.0, like the previous patch.
1 parent 6327730 commit 9c39d7a

File tree

1 file changed

+33
-14
lines changed

1 file changed

+33
-14
lines changed

‎src/backend/postmaster/postmaster.c

Lines changed: 33 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -275,6 +275,17 @@ static pid_t StartupPID = 0,
275275
PgStatPID=0,
276276
SysLoggerPID=0;
277277

278+
/* Startup process's status */
279+
typedefenum
280+
{
281+
STARTUP_NOT_RUNNING,
282+
STARTUP_RUNNING,
283+
STARTUP_SIGNALED,/* we sent it a SIGQUIT or SIGKILL */
284+
STARTUP_CRASHED
285+
}StartupStatusEnum;
286+
287+
staticStartupStatusEnumStartupStatus=STARTUP_NOT_RUNNING;
288+
278289
/* Startup/shutdown state */
279290
#defineNoShutdown0
280291
#defineSmartShutdown1
@@ -283,7 +294,6 @@ static pid_t StartupPID = 0,
283294
staticintShutdown=NoShutdown;
284295

285296
staticboolFatalError= false;/* T if recovering from backend crash */
286-
staticboolRecoveryError= false;/* T if WAL recovery failed */
287297

288298
/*
289299
* We use a simple state machine to control startup, shutdown, and
@@ -326,8 +336,6 @@ static bool RecoveryError = false;/* T if WAL recovery failed */
326336
* states, nor in PM_SHUTDOWN states (because we don't enter those states
327337
* when trying to recover from a crash). It can be true in PM_STARTUP state,
328338
* because we don't clear it until we've successfully started WAL redo.
329-
* Similarly, RecoveryError means that we have crashed during recovery, and
330-
* should not try to restart.
331339
*/
332340
typedefenum
333341
{
@@ -1266,6 +1274,7 @@ PostmasterMain(int argc, char *argv[])
12661274
*/
12671275
StartupPID=StartupDataBase();
12681276
Assert(StartupPID!=0);
1277+
StartupStatus=STARTUP_RUNNING;
12691278
pmState=PM_STARTUP;
12701279

12711280
/* Some workers may be scheduled to start now */
@@ -2565,6 +2574,7 @@ reaper(SIGNAL_ARGS)
25652574
if (Shutdown>NoShutdown&&
25662575
(EXIT_STATUS_0(exitstatus)||EXIT_STATUS_1(exitstatus)))
25672576
{
2577+
StartupStatus=STARTUP_NOT_RUNNING;
25682578
pmState=PM_WAIT_BACKENDS;
25692579
/* PostmasterStateMachine logic does the rest */
25702580
continue;
@@ -2587,16 +2597,18 @@ reaper(SIGNAL_ARGS)
25872597
/*
25882598
* After PM_STARTUP, any unexpected exit (including FATAL exit) of
25892599
* the startup process is catastrophic, so kill other children,
2590-
* and set RecoveryError so we don't try to reinitialize after
2591-
* they're gone. Exception: if FatalError is already set, that
2592-
* implies we previously sent the startup process a SIGQUIT, so
2600+
* and set StartupStatus so we don't try to reinitialize after
2601+
* they're gone. Exception: if StartupStatus is STARTUP_SIGNALED,
2602+
* then we previously sent the startup process a SIGQUIT; so
25932603
* that's probably the reason it died, and we do want to try to
25942604
* restart in that case.
25952605
*/
25962606
if (!EXIT_STATUS_0(exitstatus))
25972607
{
2598-
if (!FatalError)
2599-
RecoveryError= true;
2608+
if (StartupStatus==STARTUP_SIGNALED)
2609+
StartupStatus=STARTUP_NOT_RUNNING;
2610+
else
2611+
StartupStatus=STARTUP_CRASHED;
26002612
HandleChildCrash(pid,exitstatus,
26012613
_("startup process"));
26022614
continue;
@@ -2605,6 +2617,7 @@ reaper(SIGNAL_ARGS)
26052617
/*
26062618
* Startup succeeded, commence normal operations
26072619
*/
2620+
StartupStatus=STARTUP_NOT_RUNNING;
26082621
FatalError= false;
26092622
ReachedNormalRunning= true;
26102623
pmState=PM_RUN;
@@ -3115,14 +3128,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
31153128

31163129
/* Take care of the startup process too */
31173130
if (pid==StartupPID)
3131+
{
31183132
StartupPID=0;
3133+
StartupStatus=STARTUP_CRASHED;
3134+
}
31193135
elseif (StartupPID!=0&& !FatalError)
31203136
{
31213137
ereport(DEBUG2,
31223138
(errmsg_internal("sending %s to process %d",
31233139
(SendStop ?"SIGSTOP" :"SIGQUIT"),
31243140
(int)StartupPID)));
31253141
signal_child(StartupPID, (SendStop ?SIGSTOP :SIGQUIT));
3142+
StartupStatus=STARTUP_SIGNALED;
31263143
}
31273144

31283145
/* Take care of the bgwriter too */
@@ -3502,13 +3519,14 @@ PostmasterStateMachine(void)
35023519
}
35033520

35043521
/*
3505-
* If recovery failed, or the user does not want an automatic restart
3506-
* after backend crashes, wait for all non-syslogger children to exit, and
3507-
* then exit postmaster. We don't try to reinitialize when recovery fails,
3508-
* because more than likely it will just fail again and we will keep
3509-
* trying forever.
3522+
* If the startup process failed, or the user does not want an automatic
3523+
* restart after backend crashes, wait for all non-syslogger children to
3524+
* exit, and then exit postmaster. We don't try to reinitialize when the
3525+
* startup process fails, because more than likely it will just fail again
3526+
* and we will keep trying forever.
35103527
*/
3511-
if (pmState==PM_NO_CHILDREN&& (RecoveryError|| !restart_after_crash))
3528+
if (pmState==PM_NO_CHILDREN&&
3529+
(StartupStatus==STARTUP_CRASHED|| !restart_after_crash))
35123530
ExitPostmaster(1);
35133531

35143532
/*
@@ -3525,6 +3543,7 @@ PostmasterStateMachine(void)
35253543

35263544
StartupPID=StartupDataBase();
35273545
Assert(StartupPID!=0);
3546+
StartupStatus=STARTUP_RUNNING;
35283547
pmState=PM_STARTUP;
35293548
}
35303549
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp