Commit 0d01c5b

Fix postmaster's handling of a startup-process crash.
Ordinarily, a failure (unexpected exit status) of the startup subprocess should be considered fatal, so the postmaster should just close up shop and quit.  However, if we sent the startup process a SIGQUIT or SIGKILL signal, the failure is hardly "unexpected", and we should attempt restart; this is necessary for recovery from ordinary backend crashes in hot-standby scenarios.  I attempted to implement the latter rule with a two-line patch in commit 442231d, but it now emerges that that patch was a few bricks shy of a load: it failed to distinguish the case of a signaled startup process from the case where the new startup process crashes before reaching database consistency.  That resulted in infinitely respawning a new startup process only to have it crash again.

To handle this properly, we really must track whether we have sent the *current* startup process a kill signal.  Rather than add yet another ad-hoc boolean to the postmaster's state, I chose to unify this with the existing RecoveryError flag into an enum tracking the startup process's state.  That seems more consistent with the postmaster's general state machine design.

Back-patch to 9.0, like the previous patch.
1 parent cf0c446, commit 0d01c5b
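
The decision the patch encodes can be summarized in a short, self-contained C sketch (condensed from the diff below; decide_startup_exit() and the main() driver are illustrative stand-ins, not code from postmaster.c):

/*
 * Minimal sketch of the startup-process state tracking added by this commit.
 * decide_startup_exit() is a hypothetical stand-in for the relevant branch
 * of the postmaster's reaper(); it is not part of PostgreSQL.
 */
#include <stdbool.h>
#include <stdio.h>

typedef enum
{
    STARTUP_NOT_RUNNING,
    STARTUP_RUNNING,
    STARTUP_SIGNALED,           /* we sent it a SIGQUIT or SIGKILL */
    STARTUP_CRASHED
} StartupStatusEnum;

static StartupStatusEnum StartupStatus = STARTUP_NOT_RUNNING;

/*
 * After an abnormal exit of the startup process, decide whether the
 * postmaster should attempt reinitialization (true) or give up (false).
 */
static bool
decide_startup_exit(void)
{
    if (StartupStatus == STARTUP_SIGNALED)
    {
        /* We killed it ourselves, so the exit is expected: allow a restart. */
        StartupStatus = STARTUP_NOT_RUNNING;
        return true;
    }
    /* It crashed on its own before reaching consistency: don't respawn forever. */
    StartupStatus = STARTUP_CRASHED;
    return false;
}

int
main(void)
{
    /* Case 1: startup process crashed on its own -> postmaster gives up. */
    StartupStatus = STARTUP_RUNNING;
    printf("unsignaled crash -> restart? %d\n", decide_startup_exit());

    /* Case 2: we had sent it SIGQUIT (e.g. after a backend crash) -> retry. */
    StartupStatus = STARTUP_SIGNALED;
    printf("signaled exit    -> restart? %d\n", decide_startup_exit());

    return 0;
}

The old RecoveryError/FatalError booleans could not reliably tell these two cases apart for the current startup process, which is what produced the infinite respawn loop described above.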

File tree

1 file changed: +37, -14 lines changed

src/backend/postmaster/postmaster.c

Lines changed: 37 additions & 14 deletions
@@ -249,6 +249,17 @@ static pid_t StartupPID = 0,
         PgStatPID = 0,
         SysLoggerPID = 0;
 
+/* Startup process's status */
+typedef enum
+{
+    STARTUP_NOT_RUNNING,
+    STARTUP_RUNNING,
+    STARTUP_SIGNALED,           /* we sent it a SIGQUIT or SIGKILL */
+    STARTUP_CRASHED
+} StartupStatusEnum;
+
+static StartupStatusEnum StartupStatus = STARTUP_NOT_RUNNING;
+
 /* Startup/shutdown state */
 #define     NoShutdown      0
 #define     SmartShutdown   1
@@ -258,7 +269,6 @@ static pid_t StartupPID = 0,
 static int  Shutdown = NoShutdown;
 
 static bool FatalError = false; /* T if recovering from backend crash */
-static bool RecoveryError = false;      /* T if WAL recovery failed */
 
 /*
  * We use a simple state machine to control startup, shutdown, and
@@ -301,8 +311,6 @@ static bool RecoveryError = false;      /* T if WAL recovery failed */
  * states, nor in PM_SHUTDOWN states (because we don't enter those states
  * when trying to recover from a crash). It can be true in PM_STARTUP state,
  * because we don't clear it until we've successfully started WAL redo.
- * Similarly, RecoveryError means that we have crashed during recovery, and
- * should not try to restart.
  */
 typedef enum
 {
@@ -1238,6 +1246,7 @@ PostmasterMain(int argc, char *argv[])
     */
    StartupPID = StartupDataBase();
    Assert(StartupPID != 0);
+   StartupStatus = STARTUP_RUNNING;
    pmState = PM_STARTUP;
 
    /* Some workers may be scheduled to start now */
@@ -2583,6 +2592,7 @@ reaper(SIGNAL_ARGS)
            if (Shutdown > NoShutdown &&
                (EXIT_STATUS_0(exitstatus) || EXIT_STATUS_1(exitstatus)))
            {
+               StartupStatus = STARTUP_NOT_RUNNING;
                pmState = PM_WAIT_BACKENDS;
                /* PostmasterStateMachine logic does the rest */
                continue;
@@ -2605,16 +2615,18 @@ reaper(SIGNAL_ARGS)
            /*
             * After PM_STARTUP, any unexpected exit (including FATAL exit) of
             * the startup process is catastrophic, so kill other children,
-            * and set RecoveryError so we don't try to reinitialize after
-            * they're gone.  Exception: if FatalError is already set, that
-            * implies we previously sent the startup process a SIGQUIT, so
+            * and set StartupStatus so we don't try to reinitialize after
+            * they're gone.  Exception: if StartupStatus is STARTUP_SIGNALED,
+            * then we previously sent the startup process a SIGQUIT; so
             * that's probably the reason it died, and we do want to try to
             * restart in that case.
             */
            if (!EXIT_STATUS_0(exitstatus))
            {
-               if (!FatalError)
-                   RecoveryError = true;
+               if (StartupStatus == STARTUP_SIGNALED)
+                   StartupStatus = STARTUP_NOT_RUNNING;
+               else
+                   StartupStatus = STARTUP_CRASHED;
                HandleChildCrash(pid, exitstatus,
                                 _("startup process"));
                continue;
@@ -2623,6 +2635,7 @@ reaper(SIGNAL_ARGS)
            /*
             * Startup succeeded, commence normal operations
             */
+           StartupStatus = STARTUP_NOT_RUNNING;
            FatalError = false;
            Assert(AbortStartTime == 0);
            ReachedNormalRunning = true;
@@ -3170,14 +3183,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
 
    /* Take care of the startup process too */
    if (pid == StartupPID)
+   {
        StartupPID = 0;
+       StartupStatus = STARTUP_CRASHED;
+   }
    else if (StartupPID != 0 && take_action)
    {
        ereport(DEBUG2,
                (errmsg_internal("sending %s to process %d",
                                 (SendStop ? "SIGSTOP" : "SIGQUIT"),
                                 (int) StartupPID)));
        signal_child(StartupPID, (SendStop ? SIGSTOP : SIGQUIT));
+       StartupStatus = STARTUP_SIGNALED;
    }
 
    /* Take care of the bgwriter too */
@@ -3569,13 +3586,14 @@ PostmasterStateMachine(void)
    }
 
    /*
-    * If recovery failed, or the user does not want an automatic restart
-    * after backend crashes, wait for all non-syslogger children to exit, and
-    * then exit postmaster. We don't try to reinitialize when recovery fails,
-    * because more than likely it will just fail again and we will keep
-    * trying forever.
+    * If the startup process failed, or the user does not want an automatic
+    * restart after backend crashes, wait for all non-syslogger children to
+    * exit, and then exit postmaster.  We don't try to reinitialize when the
+    * startup process fails, because more than likely it will just fail again
+    * and we will keep trying forever.
     */
-   if (pmState == PM_NO_CHILDREN && (RecoveryError || !restart_after_crash))
+   if (pmState == PM_NO_CHILDREN &&
+       (StartupStatus == STARTUP_CRASHED || !restart_after_crash))
        ExitPostmaster(1);
 
    /*
@@ -3595,6 +3613,7 @@ PostmasterStateMachine(void)
 
        StartupPID = StartupDataBase();
        Assert(StartupPID != 0);
+       StartupStatus = STARTUP_RUNNING;
        pmState = PM_STARTUP;
        /* crash recovery started, reset SIGKILL flag */
        AbortStartTime = 0;
@@ -3726,7 +3745,11 @@ TerminateChildren(int signal)
 {
    SignalChildren(signal);
    if (StartupPID != 0)
+   {
        signal_child(StartupPID, signal);
+       if (signal == SIGQUIT || signal == SIGKILL)
+           StartupStatus = STARTUP_SIGNALED;
+   }
    if (BgWriterPID != 0)
        signal_child(BgWriterPID, signal);
    if (CheckpointerPID != 0)

0 commit comments
