@@ -366,6 +366,10 @@ start_postmaster(void)
366366/*
367367 * Since there might be quotes to handle here, it is easier simply to pass
368368 * everything to a shell to process them.
369+ *
370+ * XXX it would be better to fork and exec so that we would know the
371+ * child postmaster's PID directly; then test_postmaster_connection could
372+ * use the PID without having to rely on reading it back from the pidfile.
369373 */
370374if (log_file != NULL )
371375snprintf (cmd ,MAXPGPATH ,SYSTEMQUOTE "\"%s\" %s%s < \"%s\" >> \"%s\" 2>&1 &" SYSTEMQUOTE ,
@@ -412,6 +416,8 @@ static PGPing
412416test_postmaster_connection (bool do_checkpoint )
413417{
414418PGPing ret = PQPING_NO_RESPONSE ;
419+ bool found_stale_pidfile = false;
420+ pgpid_t pm_pid = 0 ;
415421char connstr [MAXPGPATH * 2 + 256 ];
416422int i ;
417423
@@ -458,78 +464,94 @@ test_postmaster_connection(bool do_checkpoint)
458464if (optlines [3 ]== NULL )
459465{
460466/* File is exactly three lines, must be pre-9.1 */
461- write_stderr (_ ("%s: -w option is not supported when starting a pre-9.1 server\n" ),
467+ write_stderr (_ ("\n %s: -w option is not supported when starting a pre-9.1 server\n" ),
462468progname );
463469return PQPING_NO_ATTEMPT ;
464470}
465471else if (optlines [4 ]!= NULL &&
466472optlines [5 ]!= NULL )
467473{
468474/* File is complete enough for us, parse it */
475+ long pmpid ;
469476time_t pmstart ;
470- int portnum ;
471- char * sockdir ;
472- char * hostaddr ;
473- char host_str [MAXPGPATH ];
474477
475478/*
476- * Easy cross-check that we are looking at the right data
477- * directory: is the postmaster older than this execution
478- * of pg_ctl? Subtract 2 seconds to allow for possible
479- * clock skew between pg_ctl and the postmaster.
479+ * Make sanity checks. If it's for a standalone backend
480+ * (negative PID), or the recorded start time is before
481+ * pg_ctl started, then either we are looking at the wrong
482+ * data directory, or this is a pre-existing pidfile that
483+ * hasn't (yet?) been overwritten by our child postmaster.
484+ * Allow 2 seconds slop for possible cross-process clock
485+ * skew.
480486 */
487+ pmpid = atol (optlines [LOCK_FILE_LINE_PID - 1 ]);
481488pmstart = atol (optlines [LOCK_FILE_LINE_START_TIME - 1 ]);
482- if (pmstart < start_time - 2 )
489+ if (pmpid <= 0 || pmstart < start_time - 2 )
483490{
484- write_stderr (_ ("%s: this data directory is running a pre-existing postmaster\n" ),
485- progname );
486- return PQPING_NO_ATTEMPT ;
491+ /*
492+ * Set flag to report stale pidfile if it doesn't
493+ * get overwritten before we give up waiting.
494+ */
495+ found_stale_pidfile = true;
487496}
497+ else
498+ {
499+ /*
500+ * OK, seems to be a valid pidfile from our child.
501+ */
502+ int portnum ;
503+ char * sockdir ;
504+ char * hostaddr ;
505+ char host_str [MAXPGPATH ];
488506
489- /*
490- * OK, extract port number and host string to use. Prefer
491- * using Unix socket if available.
492- */
493- portnum = atoi (optlines [LOCK_FILE_LINE_PORT - 1 ]);
507+ found_stale_pidfile = false;
508+ pm_pid = (pgpid_t )pmpid ;
494509
495- sockdir = optlines [LOCK_FILE_LINE_SOCKET_DIR - 1 ];
496- hostaddr = optlines [LOCK_FILE_LINE_LISTEN_ADDR - 1 ];
510+ /*
511+ * Extract port number and host string to use. Prefer
512+ * using Unix socket if available.
513+ */
514+ portnum = atoi (optlines [LOCK_FILE_LINE_PORT - 1 ]);
515+ sockdir = optlines [LOCK_FILE_LINE_SOCKET_DIR - 1 ];
516+ hostaddr = optlines [LOCK_FILE_LINE_LISTEN_ADDR - 1 ];
497517
498- /*
499- * While unix_socket_directory can accept relative
500- * directories, libpq's host parameter must have a leading
501- * slash to indicate a socket directory. So, ignore
502- * sockdir if it's relative, and try to use TCP instead.
503- */
504- if (sockdir [0 ]== '/' )
505- strlcpy (host_str ,sockdir ,sizeof (host_str ));
506- else
507- strlcpy (host_str ,hostaddr ,sizeof (host_str ));
518+ /*
519+ * While unix_socket_directory can accept relative
520+ * directories, libpq's host parameter must have a
521+ * leading slash to indicate a socket directory. So,
522+ * ignore sockdir if it's relative, and try to use TCP
523+ * instead.
524+ */
525+ if (sockdir [0 ]== '/' )
526+ strlcpy (host_str ,sockdir ,sizeof (host_str ));
527+ else
528+ strlcpy (host_str ,hostaddr ,sizeof (host_str ));
508529
509- /* remove trailing newline */
510- if (strchr (host_str ,'\n' )!= NULL )
511- * strchr (host_str ,'\n' )= '\0' ;
530+ /* remove trailing newline */
531+ if (strchr (host_str ,'\n' )!= NULL )
532+ * strchr (host_str ,'\n' )= '\0' ;
512533
513- /* Fail if we couldn't get either sockdir or host addr */
514- if (host_str [0 ]== '\0' )
515- {
516- write_stderr (_ ("%s: -w option cannot use a relative socket directory specification\n" ),
517- progname );
518- return PQPING_NO_ATTEMPT ;
519- }
534+ /* Fail if couldn't get either sockdir or host addr */
535+ if (host_str [0 ]== '\0' )
536+ {
537+ write_stderr (_ ("\n %s: -w option cannot use a relative socket directory specification\n" ),
538+ progname );
539+ return PQPING_NO_ATTEMPT ;
540+ }
520541
521- /* If postmaster is listening on "*", use" localhost" */
522- if (strcmp (host_str ,"*" )== 0 )
523- strcpy (host_str ,"localhost" );
542+ /* If postmaster is listening on "*", use localhost */
543+ if (strcmp (host_str ,"*" )== 0 )
544+ strcpy (host_str ,"localhost" );
524545
525- /*
526- * We need to set connect_timeout otherwise on Windows the
527- * Service Control Manager (SCM) will probably timeout
528- * first.
529- */
530- snprintf (connstr ,sizeof (connstr ),
531- "dbname=postgres port=%d host='%s' connect_timeout=5" ,
532- portnum ,host_str );
546+ /*
547+ * We need to set connect_timeout otherwise on Windows
548+ * the Service Control Manager (SCM) will probably
549+ * timeout first.
550+ */
551+ snprintf (connstr ,sizeof (connstr ),
552+ "dbname=postgres port=%d host='%s' connect_timeout=5" ,
553+ portnum ,host_str );
554+ }
533555}
534556}
535557}
@@ -545,16 +567,36 @@ test_postmaster_connection(bool do_checkpoint)
545567/*
546568 * The postmaster should create postmaster.pid very soon after being
547569 * started. If it's not there after we've waited 5 or more seconds,
548- * assume startup failed and give up waiting.
570+ * assume startup failed and give up waiting. (Note this covers
571+ * both cases where the pidfile was never created, and where it was
572+ * created and then removed during postmaster exit.) Also, if there
573+ * *is* a file there but it appears stale, issue a suitable warning
574+ * and give up waiting.
549575 */
550576if (i >=5 )
551577{
552578struct stat statbuf ;
553579
554580if (stat (pid_file ,& statbuf )!= 0 )
555581return PQPING_NO_RESPONSE ;
582+
583+ if (found_stale_pidfile )
584+ {
585+ write_stderr (_ ("\n%s: this data directory appears to be running a pre-existing postmaster\n" ),
586+ progname );
587+ return PQPING_NO_RESPONSE ;
588+ }
556589}
557590
591+ /*
592+ * If we've been able to identify the child postmaster's PID, check
593+ * the process is still alive. This covers cases where the postmaster
594+ * successfully created the pidfile but then crashed without removing
595+ * it.
596+ */
597+ if (pm_pid > 0 && !postmaster_is_alive ((pid_t )pm_pid ))
598+ return PQPING_NO_RESPONSE ;
599+
558600/* No response, or startup still in process; wait */
559601#if defined(WIN32 )
560602if (do_checkpoint )
@@ -715,7 +757,6 @@ do_init(void)
715757static void
716758do_start (void )
717759{
718- pgpid_t pid ;
719760pgpid_t old_pid = 0 ;
720761int exitcode ;
721762
@@ -765,19 +806,6 @@ do_start(void)
765806exit (1 );
766807}
767808
768- if (old_pid != 0 )
769- {
770- pg_usleep (1000000 );
771- pid = get_pgpid ();
772- if (pid == old_pid )
773- {
774- write_stderr (_ ("%s: could not start server\n"
775- "Examine the log output.\n" ),
776- progname );
777- exit (1 );
778- }
779- }
780-
781809if (do_wait )
782810{
783811print_msg (_ ("waiting for server to start..." ));