Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit63f64d2

Browse files
committed
Fix postmaster's handling of fork failure for a bgworker process.
This corner case didn't behave nicely at all: the postmaster would(partially) update its state as though the process had startedsuccessfully, and be quite confused thereafter. Fix it to actlike the worker had crashed, instead.In passing, refactor so that do_start_bgworker contains all thestate-change logic for bgworker launch, rather than just some of it.Back-patch as far as 9.4. 9.3 contains similar logic, but it's justenough different that I don't feel comfortable applying the patchwithout more study; and the use of bgworkers in 9.3 was so smallthat it doesn't seem worth the extra work.transam/parallel.c is still entirely unprepared for the possibilityof bgworker startup failure, but that seems like material for aseparate patch.Discussion:https://postgr.es/m/4905.1492813727@sss.pgh.pa.us
1 parent39369b4 commit63f64d2

File tree

1 file changed

+77
-31
lines changed

1 file changed

+77
-31
lines changed

‎src/backend/postmaster/postmaster.c

Lines changed: 77 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,7 @@ static void TerminateChildren(int signal);
412412
#defineSignalChildren(sig) SignalSomeChildren(sig, BACKEND_TYPE_ALL)
413413

414414
staticintCountChildren(inttarget);
415+
staticboolassign_backendlist_entry(RegisteredBgWorker*rw);
415416
staticvoidmaybe_start_bgworker(void);
416417
staticboolCreateOptsFile(intargc,char*argv[],char*fullprogname);
417418
staticpid_tStartChildProcess(AuxProcTypetype);
@@ -5491,13 +5492,33 @@ bgworker_forkexec(int shmem_slot)
54915492
* Start a new bgworker.
54925493
* Starting time conditions must have been checked already.
54935494
*
5495+
* Returns true on success, false on failure.
5496+
* In either case, update the RegisteredBgWorker's state appropriately.
5497+
*
54945498
* This code is heavily based on autovacuum.c, q.v.
54955499
*/
5496-
staticvoid
5500+
staticbool
54975501
do_start_bgworker(RegisteredBgWorker*rw)
54985502
{
54995503
pid_tworker_pid;
55005504

5505+
Assert(rw->rw_pid==0);
5506+
5507+
/*
5508+
* Allocate and assign the Backend element. Note we must do this before
5509+
* forking, so that we can handle out of memory properly.
5510+
*
5511+
* Treat failure as though the worker had crashed. That way, the
5512+
* postmaster will wait a bit before attempting to start it again; if it
5513+
* tried again right away, most likely it'd find itself repeating the
5514+
* out-of-memory or fork failure condition.
5515+
*/
5516+
if (!assign_backendlist_entry(rw))
5517+
{
5518+
rw->rw_crashed_at=GetCurrentTimestamp();
5519+
return false;
5520+
}
5521+
55015522
ereport(DEBUG1,
55025523
(errmsg("starting background worker process \"%s\"",
55035524
rw->rw_worker.bgw_name)));
@@ -5509,9 +5530,17 @@ do_start_bgworker(RegisteredBgWorker *rw)
55095530
#endif
55105531
{
55115532
case-1:
5533+
/* in postmaster, fork failed ... */
55125534
ereport(LOG,
55135535
(errmsg("could not fork worker process: %m")));
5514-
return;
5536+
/* undo what assign_backendlist_entry did */
5537+
ReleasePostmasterChildSlot(rw->rw_child_slot);
5538+
rw->rw_child_slot=0;
5539+
free(rw->rw_backend);
5540+
rw->rw_backend=NULL;
5541+
/* mark entry as crashed, so we'll try again later */
5542+
rw->rw_crashed_at=GetCurrentTimestamp();
5543+
break;
55155544

55165545
#ifndefEXEC_BACKEND
55175546
case0:
@@ -5535,14 +5564,24 @@ do_start_bgworker(RegisteredBgWorker *rw)
55355564
PostmasterContext=NULL;
55365565

55375566
StartBackgroundWorker();
5567+
5568+
exit(1);/* should not get here */
55385569
break;
55395570
#endif
55405571
default:
5572+
/* in postmaster, fork successful ... */
55415573
rw->rw_pid=worker_pid;
55425574
rw->rw_backend->pid=rw->rw_pid;
55435575
ReportBackgroundWorkerPID(rw);
5544-
break;
5576+
/* add new worker to lists of backends */
5577+
dlist_push_head(&BackendList,&rw->rw_backend->elem);
5578+
#ifdefEXEC_BACKEND
5579+
ShmemBackendArrayAdd(rw->rw_backend);
5580+
#endif
5581+
return true;
55455582
}
5583+
5584+
return false;
55465585
}
55475586

55485587
/*
@@ -5589,6 +5628,8 @@ bgworker_should_start_now(BgWorkerStartTime start_time)
55895628
* Allocate the Backend struct for a connected background worker, but don't
55905629
* add it to the list of backends just yet.
55915630
*
5631+
* On failure, return false without changing any worker state.
5632+
*
55925633
* Some info from the Backend is copied into the passed rw.
55935634
*/
55945635
staticbool
@@ -5601,14 +5642,6 @@ assign_backendlist_entry(RegisteredBgWorker *rw)
56015642
ereport(LOG,
56025643
(errcode(ERRCODE_OUT_OF_MEMORY),
56035644
errmsg("out of memory")));
5604-
5605-
/*
5606-
* The worker didn't really crash, but setting this nonzero makes
5607-
* postmaster wait a bit before attempting to start it again; if it
5608-
* tried again right away, most likely it'd find itself under the same
5609-
* memory pressure.
5610-
*/
5611-
rw->rw_crashed_at=GetCurrentTimestamp();
56125645
return false;
56135646
}
56145647

@@ -5638,20 +5671,31 @@ assign_backendlist_entry(RegisteredBgWorker *rw)
56385671
* As a side effect, the bgworker control variables are set or reset whenever
56395672
* there are more workers to start after this one, and whenever the overall
56405673
* system state requires it.
5674+
*
5675+
* The reason we start at most one worker per call is to avoid consuming the
5676+
* postmaster's attention for too long when many such requests are pending.
5677+
* As long as StartWorkerNeeded is true, ServerLoop will not block and will
5678+
* call this function again after dealing with any other issues.
56415679
*/
56425680
staticvoid
56435681
maybe_start_bgworker(void)
56445682
{
56455683
slist_mutable_iteriter;
56465684
TimestampTznow=0;
56475685

5686+
/*
5687+
* During crash recovery, we have no need to be called until the state
5688+
* transition out of recovery.
5689+
*/
56485690
if (FatalError)
56495691
{
56505692
StartWorkerNeeded= false;
56515693
HaveCrashedWorker= false;
5652-
return;/* not yet */
5694+
return;
56535695
}
56545696

5697+
/* Don't need to be called again unless we find a reason for it below */
5698+
StartWorkerNeeded= false;
56555699
HaveCrashedWorker= false;
56565700

56575701
slist_foreach_modify(iter,&BackgroundWorkerList)
@@ -5660,11 +5704,11 @@ maybe_start_bgworker(void)
56605704

56615705
rw=slist_container(RegisteredBgWorker,rw_lnode,iter.cur);
56625706

5663-
/* already running? */
5707+
/*ignore ifalready running */
56645708
if (rw->rw_pid!=0)
56655709
continue;
56665710

5667-
/* marked for death? */
5711+
/*ifmarked for death, clean up and remove from list */
56685712
if (rw->rw_terminate)
56695713
{
56705714
ForgetBackgroundWorker(&iter);
@@ -5686,48 +5730,50 @@ maybe_start_bgworker(void)
56865730
continue;
56875731
}
56885732

5733+
/* read system time only when needed */
56895734
if (now==0)
56905735
now=GetCurrentTimestamp();
56915736

56925737
if (!TimestampDifferenceExceeds(rw->rw_crashed_at,now,
56935738
rw->rw_worker.bgw_restart_time*1000))
56945739
{
5740+
/* Set flag to remember that we have workers to start later */
56955741
HaveCrashedWorker= true;
56965742
continue;
56975743
}
56985744
}
56995745

57005746
if (bgworker_should_start_now(rw->rw_worker.bgw_start_time))
57015747
{
5702-
/* reset crash time beforecalling assign_backendlist_entry */
5748+
/* reset crash time beforetrying to start worker */
57035749
rw->rw_crashed_at=0;
57045750

57055751
/*
5706-
* Allocate and assign the Backend element. Note we must do this
5707-
* before forking, so that we can handle out of memory properly.
5752+
* Try to start the worker.
5753+
*
5754+
* On failure, give up processing workers for now, but set
5755+
* StartWorkerNeeded so we'll come back here on the next iteration
5756+
* of ServerLoop to try again. (We don't want to wait, because
5757+
* there might be additional ready-to-run workers.) We could set
5758+
* HaveCrashedWorker as well, since this worker is now marked
5759+
* crashed, but there's no need because the next run of this
5760+
* function will do that.
57085761
*/
5709-
if (!assign_backendlist_entry(rw))
5762+
if (!do_start_bgworker(rw))
5763+
{
5764+
StartWorkerNeeded= true;
57105765
return;
5711-
5712-
do_start_bgworker(rw);/* sets rw->rw_pid */
5713-
5714-
dlist_push_head(&BackendList,&rw->rw_backend->elem);
5715-
#ifdefEXEC_BACKEND
5716-
ShmemBackendArrayAdd(rw->rw_backend);
5717-
#endif
5766+
}
57185767

57195768
/*
5720-
*HaveServerLoop call us again. Note that there might not
5721-
*actually *be* another runnable worker, but we don't care all
5722-
*that much; we will findout the next time we run.
5769+
*Quit, but haveServerLoop call us again to look for additional
5770+
*ready-to-run workers. There might not be any, but we'll find
5771+
* out the next time we run.
57235772
*/
57245773
StartWorkerNeeded= true;
57255774
return;
57265775
}
57275776
}
5728-
5729-
/* no runnable worker found */
5730-
StartWorkerNeeded= false;
57315777
}
57325778

57335779
/*

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp