Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 61c20a5

Browse files
committed
Make pg_ctl stop/restart/promote recheck postmaster aliveness.
"pg_ctl stop/restart" checked that the postmaster PID is valid just once, as a side-effect of sending the stop signal, and then would wait-till-timeout for the postmaster.pid file to go away. This neglects the case wherein the postmaster dies uncleanly after we signal it. Similarly, once "pg_ctl promote" has sent the signal, it'd wait for the corresponding on-disk state change to occur even if the postmaster dies. I'm not sure how we've managed not to notice this problem, but it seems to explain slow execution of the 017_shm.pl test script on AIX since commit 4fdbf9a, which added a speculative "pg_ctl stop" with the idea of making real sure that the postmaster isn't there. In the test steps that kill-9 and then restart the postmaster, it's possible to get past the initial signal attempt before kill() stops working for the doomed postmaster. If that happens, pg_ctl waited till PGCTLTIMEOUT before giving up ... and the buildfarm's AIX members have that set very high. To fix, include a "kill(pid, 0)" test (similar to what postmaster_is_alive uses) in these wait loops, so that we'll give up immediately if the postmaster PID disappears. While here, I chose to refactor those loops out of where they were. do_stop() and do_restart() can perfectly well share one copy of the wait-for-stop loop, and it seems desirable to put a similar function beside that for wait-for-promote. Back-patch to all supported versions, since pg_ctl's wait logic is substantially identical in all, and we're seeing the slow test behavior in all branches. Discussion: https://postgr.es/m/20220210023537.GA3222837@rfd.leadboat.com
1 parent 64dd648, commit 61c20a5

File tree

1 file changed

+80
-48
lines changed

1 file changed

+80
-48
lines changed

‎src/bin/pg_ctl/pg_ctl.c

Lines changed: 80 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -159,7 +159,9 @@ static void free_readfile(char **optlines);
159159
staticpgpid_tstart_postmaster(void);
160160
staticvoidread_post_opts(void);
161161

162-
staticWaitPMResultwait_for_postmaster(pgpid_tpm_pid,booldo_checkpoint);
162+
staticWaitPMResultwait_for_postmaster_start(pgpid_tpm_pid,booldo_checkpoint);
163+
staticboolwait_for_postmaster_stop(void);
164+
staticboolwait_for_postmaster_promote(void);
163165
staticboolpostmaster_is_alive(pid_tpid);
164166

165167
#if defined(HAVE_GETRLIMIT)&& defined(RLIMIT_CORE)
@@ -590,7 +592,7 @@ start_postmaster(void)
590592
* manager checkpoint, it's got nothing to do with database checkpoints!!
591593
*/
592594
staticWaitPMResult
593-
wait_for_postmaster(pgpid_tpm_pid,booldo_checkpoint)
595+
wait_for_postmaster_start(pgpid_tpm_pid,booldo_checkpoint)
594596
{
595597
inti;
596598

@@ -700,6 +702,76 @@ wait_for_postmaster(pgpid_t pm_pid, bool do_checkpoint)
700702
}
701703

702704

705+
/*
706+
* Wait for the postmaster to stop.
707+
*
708+
* Returns true if the postmaster stopped cleanly (i.e., removed its pidfile).
709+
* Returns false if the postmaster dies uncleanly, or if we time out.
710+
*/
711+
staticbool
712+
wait_for_postmaster_stop(void)
713+
{
714+
intcnt;
715+
716+
for (cnt=0;cnt<wait_seconds*WAITS_PER_SEC;cnt++)
717+
{
718+
pgpid_tpid;
719+
720+
if ((pid=get_pgpid(false))==0)
721+
return true;/* pid file is gone */
722+
723+
if (kill((pid_t)pid,0)!=0)
724+
{
725+
/*
726+
* Postmaster seems to have died. Check the pid file once more to
727+
* avoid a race condition, but give up waiting.
728+
*/
729+
if (get_pgpid(false)==0)
730+
return true;/* pid file is gone */
731+
return false;/* postmaster died untimely */
732+
}
733+
734+
if (cnt %WAITS_PER_SEC==0)
735+
print_msg(".");
736+
pg_usleep(USEC_PER_SEC /WAITS_PER_SEC);
737+
}
738+
return false;/* timeout reached */
739+
}
740+
741+
742+
/*
743+
* Wait for the postmaster to promote.
744+
*
745+
* Returns true on success, else false.
746+
* To avoid waiting uselessly, we check for postmaster death here too.
747+
*/
748+
staticbool
749+
wait_for_postmaster_promote(void)
750+
{
751+
intcnt;
752+
753+
for (cnt=0;cnt<wait_seconds*WAITS_PER_SEC;cnt++)
754+
{
755+
pgpid_tpid;
756+
DBStatestate;
757+
758+
if ((pid=get_pgpid(false))==0)
759+
return false;/* pid file is gone */
760+
if (kill((pid_t)pid,0)!=0)
761+
return false;/* postmaster died */
762+
763+
state=get_control_dbstate();
764+
if (state==DB_IN_PRODUCTION)
765+
return true;/* successful promotion */
766+
767+
if (cnt %WAITS_PER_SEC==0)
768+
print_msg(".");
769+
pg_usleep(USEC_PER_SEC /WAITS_PER_SEC);
770+
}
771+
return false;/* timeout reached */
772+
}
773+
774+
703775
#if defined(HAVE_GETRLIMIT)&& defined(RLIMIT_CORE)
704776
staticvoid
705777
unlimit_core_size(void)
@@ -915,7 +987,7 @@ do_start(void)
915987

916988
print_msg(_("waiting for server to start..."));
917989

918-
switch (wait_for_postmaster(pm_pid, false))
990+
switch (wait_for_postmaster_start(pm_pid, false))
919991
{
920992
casePOSTMASTER_READY:
921993
print_msg(_(" done\n"));
@@ -950,7 +1022,6 @@ do_start(void)
9501022
staticvoid
9511023
do_stop(void)
9521024
{
953-
intcnt;
9541025
pgpid_tpid;
9551026
structstatstatbuf;
9561027

@@ -1001,19 +1072,7 @@ do_stop(void)
10011072

10021073
print_msg(_("waiting for server to shut down..."));
10031074

1004-
for (cnt=0;cnt<wait_seconds*WAITS_PER_SEC;cnt++)
1005-
{
1006-
if ((pid=get_pgpid(false))!=0)
1007-
{
1008-
if (cnt %WAITS_PER_SEC==0)
1009-
print_msg(".");
1010-
pg_usleep(USEC_PER_SEC /WAITS_PER_SEC);
1011-
}
1012-
else
1013-
break;
1014-
}
1015-
1016-
if (pid!=0)/* pid file still exists */
1075+
if (!wait_for_postmaster_stop())
10171076
{
10181077
print_msg(_(" failed\n"));
10191078

@@ -1037,7 +1096,6 @@ do_stop(void)
10371096
staticvoid
10381097
do_restart(void)
10391098
{
1040-
intcnt;
10411099
pgpid_tpid;
10421100
structstatstatbuf;
10431101

@@ -1091,20 +1149,7 @@ do_restart(void)
10911149
print_msg(_("waiting for server to shut down..."));
10921150

10931151
/* always wait for restart */
1094-
1095-
for (cnt=0;cnt<wait_seconds*WAITS_PER_SEC;cnt++)
1096-
{
1097-
if ((pid=get_pgpid(false))!=0)
1098-
{
1099-
if (cnt %WAITS_PER_SEC==0)
1100-
print_msg(".");
1101-
pg_usleep(USEC_PER_SEC /WAITS_PER_SEC);
1102-
}
1103-
else
1104-
break;
1105-
}
1106-
1107-
if (pid!=0)/* pid file still exists */
1152+
if (!wait_for_postmaster_stop())
11081153
{
11091154
print_msg(_(" failed\n"));
11101155

@@ -1229,21 +1274,8 @@ do_promote(void)
12291274

12301275
if (do_wait)
12311276
{
1232-
DBStatestate=DB_STARTUP;
1233-
intcnt;
1234-
12351277
print_msg(_("waiting for server to promote..."));
1236-
for (cnt=0;cnt<wait_seconds*WAITS_PER_SEC;cnt++)
1237-
{
1238-
state=get_control_dbstate();
1239-
if (state==DB_IN_PRODUCTION)
1240-
break;
1241-
1242-
if (cnt %WAITS_PER_SEC==0)
1243-
print_msg(".");
1244-
pg_usleep(USEC_PER_SEC /WAITS_PER_SEC);
1245-
}
1246-
if (state==DB_IN_PRODUCTION)
1278+
if (wait_for_postmaster_promote())
12471279
{
12481280
print_msg(_(" done\n"));
12491281
print_msg(_("server promoted\n"));
@@ -1688,7 +1720,7 @@ pgwin32_ServiceMain(DWORD argc, LPTSTR *argv)
16881720
if (do_wait)
16891721
{
16901722
write_eventlog(EVENTLOG_INFORMATION_TYPE,_("Waiting for server startup...\n"));
1691-
if (wait_for_postmaster(postmasterPID, true)!=POSTMASTER_READY)
1723+
if (wait_for_postmaster_start(postmasterPID, true)!=POSTMASTER_READY)
16921724
{
16931725
write_eventlog(EVENTLOG_ERROR_TYPE,_("Timed out waiting for server startup\n"));
16941726
pgwin32_SetServiceStatus(SERVICE_STOPPED);
@@ -1709,7 +1741,7 @@ pgwin32_ServiceMain(DWORD argc, LPTSTR *argv)
17091741
{
17101742
/*
17111743
* status.dwCheckPoint can be incremented by
1712-
*wait_for_postmaster(), so it might not start from 0.
1744+
*wait_for_postmaster_start(), so it might not start from 0.
17131745
*/
17141746
intmaxShutdownCheckPoint=status.dwCheckPoint+12;
17151747

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp