1111 *
1212 *
1313 * IDENTIFICATION
14- * $Header: /cvsroot/pgsql/src/backend/postmaster/postmaster.c,v 1.200 2000/12/18 18:45:04 momjian Exp $
14+ * $Header: /cvsroot/pgsql/src/backend/postmaster/postmaster.c,v 1.201 2000/12/20 21:51:52 tgl Exp $
1515 *
1616 * NOTES
1717 *
@@ -180,7 +180,7 @@ static time_t checkpointed = 0;
180180
181181static int Shutdown = NoShutdown ;
182182
183- static bool FatalError = false;
183+ static bool FatalError = false;/* T if recovering from backend crash */
184184
185185/*
186186 * State for assigning random salts and cancel keys.
@@ -649,7 +649,7 @@ PostmasterMain(int argc, char *argv[])
649649pqsignal (SIGTERM ,pmdie );/* wait for children and ShutdownDataBase */
650650pqsignal (SIGALRM ,SIG_IGN );/* ignored */
651651pqsignal (SIGPIPE ,SIG_IGN );/* ignored */
652- pqsignal (SIGUSR1 ,SIG_IGN ); /* ignored */
652+ pqsignal (SIGUSR1 ,pmdie ); /*currently ignored, but see note in pmdie */
653653pqsignal (SIGUSR2 ,pmdie );/* send SIGUSR2, don't die */
654654pqsignal (SIGCHLD ,reaper );/* handle child termination */
655655pqsignal (SIGTTIN ,SIG_IGN );/* ignored */
@@ -1329,6 +1329,18 @@ pmdie(SIGNAL_ARGS)
13291329
13301330switch (postgres_signal_arg )
13311331{
1332+ case SIGUSR1 :
1333+ /*
1334+ * Currently the postmaster ignores SIGUSR1 (maybe it should
1335+ * do something useful instead?) But we must have some handler
1336+ * installed for SIGUSR1, not just set it to SIG_IGN. Else, a
1337+ * freshly spawned backend would likewise have it set to SIG_IGN,
1338+ * which would mean the backend would ignore any attempt to kill
1339+ * it before it had gotten as far as setting up its own handler.
1340+ */
1341+ errno = save_errno ;
1342+ return ;
1343+
13321344case SIGUSR2 :
13331345
13341346/*
@@ -1511,7 +1523,7 @@ reaper(SIGNAL_ARGS)
15111523ExitPostmaster (1 );
15121524}
15131525StartupPID = 0 ;
1514- FatalError = false;
1526+ FatalError = false;/* done with recovery */
15151527if (Shutdown > NoShutdown )
15161528{
15171529if (ShutdownPID > 0 )
@@ -1539,12 +1551,7 @@ reaper(SIGNAL_ARGS)
15391551/*
15401552 * Wait for all children to exit, then reset shmem and StartupDataBase.
15411553 */
1542- if (DLGetHead (BackendList ))
1543- {
1544- errno = save_errno ;
1545- return ;
1546- }
1547- if (StartupPID > 0 || ShutdownPID > 0 )
1554+ if (DLGetHead (BackendList )|| StartupPID > 0 || ShutdownPID > 0 )
15481555{
15491556errno = save_errno ;
15501557return ;
@@ -1595,21 +1602,18 @@ CleanupProc(int pid,
15951602Dlelem * curr ,
15961603* next ;
15971604Backend * bp ;
1598- int sig ;
15991605
16001606if (DebugLvl )
1601- {
16021607fprintf (stderr ,"%s: CleanupProc: pid %d exited with status %d\n" ,
16031608progname ,pid ,exitstatus );
1604- }
16051609
16061610/*
16071611 * If a backend dies in an ugly way (i.e. exit status not 0) then we
16081612 * must signal all other backends to quickdie. If exit status is zero
16091613 * we assume everything is hunky dory and simply remove the backend
16101614 * from the active backend list.
16111615 */
1612- if (! exitstatus )
1616+ if (exitstatus == 0 )
16131617{
16141618curr = DLGetHead (BackendList );
16151619while (curr )
@@ -1628,73 +1632,78 @@ CleanupProc(int pid,
16281632if (pid == CheckPointPID )
16291633{
16301634CheckPointPID = 0 ;
1631- checkpointed = time (NULL );
1635+ if (!FatalError )
1636+ checkpointed = time (NULL );
16321637}
16331638else
1634- ProcRemove (pid );
1639+ {
1640+ /* Why is this done here, and not by the backend itself? */
1641+ if (!FatalError )
1642+ ProcRemove (pid );
1643+ }
16351644
16361645return ;
16371646}
16381647
16391648if (!FatalError )
16401649{
1650+ /* Make log entry unless we did so already */
16411651tnow = time (NULL );
16421652fprintf (stderr ,"Server process (pid %d) exited with status %d at %s"
16431653"Terminating any active server processes...\n" ,
16441654pid ,exitstatus ,ctime (& tnow ));
16451655fflush (stderr );
16461656}
1647- FatalError = true;
1657+
16481658curr = DLGetHead (BackendList );
16491659while (curr )
16501660{
16511661next = DLGetSucc (curr );
16521662bp = (Backend * )DLE_VAL (curr );
1653-
1654- /*
1655- * SIGUSR1 is the special signal that says exit without proc_exit
1656- * and let the user know what's going on. ProcSemaphoreKill()
1657- * cleans up the backends semaphore. If SendStop is set (-s on
1658- * command line), then we send a SIGSTOP so that we can core dumps
1659- * from all backends by hand.
1660- */
1661- sig = (SendStop ) ?SIGSTOP :SIGUSR1 ;
16621663if (bp -> pid != pid )
16631664{
1664- if (DebugLvl )
1665- fprintf (stderr ,"%s: CleanupProc: sending %s to process %d\n" ,
1666- progname ,
1667- (sig == SIGUSR1 )
1668- ?"SIGUSR1" :"SIGSTOP" ,
1669- bp -> pid );
1670- kill (bp -> pid ,sig );
1665+ /*
1666+ * This backend is still alive. Unless we did so already,
1667+ * tell it to commit hara-kiri.
1668+ *
1669+ * SIGUSR1 is the special signal that says exit without proc_exit
1670+ * and let the user know what's going on. But if SendStop is set
1671+ * (-s on command line), then we send SIGSTOP instead, so that we
1672+ * can get core dumps from all backends by hand.
1673+ */
1674+ if (!FatalError )
1675+ {
1676+ if (DebugLvl )
1677+ fprintf (stderr ,"%s: CleanupProc: sending %s to process %d\n" ,
1678+ progname ,
1679+ (SendStop ?"SIGSTOP" :"SIGUSR1" ),
1680+ bp -> pid );
1681+ kill (bp -> pid , (SendStop ?SIGSTOP :SIGUSR1 ));
1682+ }
16711683}
16721684else
16731685{
1674-
16751686/*
1676- * I don't like that we call ProcRemove() here, assuming that
1677- * shmem may be corrupted! But is there another way to free
1678- * backend semaphores? Actually, I believe that we need not in
1679- * per backend semaphore at all (we use them to wait on lock
1680- * only, couldn't we just sigpause?), so probably we'll remove
1681- * this call from here someday. -- vadim 04-10-1999
1687+ * Found entry for freshly-dead backend, so remove it.
1688+ *
1689+ * Don't call ProcRemove() here, since shmem may be corrupted!
1690+ * We are going to reinitialize shmem and semaphores anyway
1691+ * once all the children are dead, so no need for it.
16821692 */
1683- if (pid == CheckPointPID )
1684- {
1685- CheckPointPID = 0 ;
1686- checkpointed = 0 ;
1687- }
1688- else
1689- ProcRemove (pid );
1690-
16911693DLRemove (curr );
16921694free (bp );
16931695DLFreeElem (curr );
16941696}
16951697curr = next ;
16961698}
16971699
1700+ if (pid == CheckPointPID )
1701+ {
1702+ CheckPointPID = 0 ;
1703+ checkpointed = 0 ;
1704+ }
1705+
1706+ FatalError = true;
16981707}
16991708
17001709/*