@@ -355,10 +355,13 @@ typedef struct XLogCtlInsert
355355 * exclusiveBackup is true if a backup started with pg_start_backup() is
356356 * in progress, and nonExclusiveBackups is a counter indicating the number
357357 * of streaming base backups currently in progress. forcePageWrites is
358- * set to true when either of these is non-zero.
358+ * set to true when either of these is non-zero. lastBackupStart is the
359+ * latest checkpoint redo location used as a starting point for an online
360+ * backup.
359361 */
360362bool exclusiveBackup ;
361363int nonExclusiveBackups ;
364+ XLogRecPtr lastBackupStart ;
362365}XLogCtlInsert ;
363366
364367/*
@@ -8808,6 +8811,19 @@ do_pg_start_backup(const char *backupidstr, bool fast, char **labelfile)
88088811errmsg ("backup label too long (max %d bytes)" ,
88098812MAXPGPATH )));
88108813
8814+ /*
8815+ * Force an XLOG file switch before the checkpoint, to ensure that the WAL
8816+ * segment the checkpoint is written to doesn't contain pages with old
8817+ * timeline IDs. That would otherwise happen if you called
8818+ * pg_start_backup() right after restoring from a PITR archive: the first
8819+ * WAL segment containing the startup checkpoint has pages in the
8820+ * beginning with the old timeline ID. That can cause trouble at recovery:
8821+ * we won't have a history file covering the old timeline if the pg_xlog
8822+ * directory was not included in the base backup and the WAL archive was
8823+ * also cleared before starting the backup.
8824+ */
8825+ RequestXLogSwitch ();
8826+
88118827/*
88128828 * Mark backup active in shared memory. We must do full-page WAL writes
88138829 * during an on-line backup even if not doing so at other times, because
@@ -8843,43 +8859,54 @@ do_pg_start_backup(const char *backupidstr, bool fast, char **labelfile)
88438859XLogCtl -> Insert .forcePageWrites = true;
88448860LWLockRelease (WALInsertLock );
88458861
8846- /*
8847- * Force an XLOG file switch before the checkpoint, to ensure that the WAL
8848- * segment the checkpoint is written to doesn't contain pages with old
8849- * timeline IDs. That would otherwise happen if you called
8850- * pg_start_backup() right after restoring from a PITR archive: the first
8851- * WAL segment containing the startup checkpoint has pages in the
8852- * beginning with the old timeline ID. That can cause trouble at recovery:
8853- * we won't have a history file covering the old timeline if pg_xlog
8854- * directory was not included in the base backup and the WAL archive was
8855- * cleared too before starting the backup.
8856- */
8857- RequestXLogSwitch ();
8858-
88598862/* Ensure we release forcePageWrites if fail below */
88608863PG_ENSURE_ERROR_CLEANUP (pg_start_backup_callback , (Datum )BoolGetDatum (exclusive ));
88618864{
8862- /*
8863- * Force a CHECKPOINT. Aside from being necessary to prevent torn
8864- * page problems, this guarantees that two successive backup runs will
8865- * have different checkpoint positions and hence different history
8866- * file names, even if nothing happened in between.
8867- *
8868- * We use CHECKPOINT_IMMEDIATE only if requested by user (via passing
8869- * fast = true). Otherwise this can take awhile.
8870- */
8871- RequestCheckpoint (CHECKPOINT_FORCE |CHECKPOINT_WAIT |
8872- (fast ?CHECKPOINT_IMMEDIATE :0 ));
8865+ bool gotUniqueStartpoint = false;
8866+ do
8867+ {
8868+ /*
8869+ * Force a CHECKPOINT. Aside from being necessary to prevent torn
8870+ * page problems, this guarantees that two successive backup runs will
8871+ * have different checkpoint positions and hence different history
8872+ * file names, even if nothing happened in between.
8873+ *
8874+ * We use CHECKPOINT_IMMEDIATE only if requested by the user (via passing
8875+ * fast = true). Otherwise this can take a while.
8876+ */
8877+ RequestCheckpoint (CHECKPOINT_FORCE |CHECKPOINT_WAIT |
8878+ (fast ?CHECKPOINT_IMMEDIATE :0 ));
88738879
8874- /*
8875- * Now we need to fetch the checkpoint record location, and also its
8876- * REDO pointer. The oldest point in WAL that would be needed to
8877- * restore starting from the checkpoint is precisely the REDO pointer.
8878- */
8879- LWLockAcquire (ControlFileLock ,LW_SHARED );
8880- checkpointloc = ControlFile -> checkPoint ;
8881- startpoint = ControlFile -> checkPointCopy .redo ;
8882- LWLockRelease (ControlFileLock );
8880+ /*
8881+ * Now we need to fetch the checkpoint record location, and also its
8882+ * REDO pointer. The oldest point in WAL that would be needed to
8883+ * restore starting from the checkpoint is precisely the REDO pointer.
8884+ */
8885+ LWLockAcquire (ControlFileLock ,LW_SHARED );
8886+ checkpointloc = ControlFile -> checkPoint ;
8887+ startpoint = ControlFile -> checkPointCopy .redo ;
8888+ LWLockRelease (ControlFileLock );
8889+
8890+ /*
8891+ * If two base backups are started at the same time (in WAL
8892+ * sender processes), we need to make sure that they use
8893+ * different checkpoints as starting locations, because we use
8894+ * the starting WAL location as a unique identifier for the base
8895+ * backup in the end-of-backup WAL record and when we write the
8896+ * backup history file. Perhaps it would be better to generate a
8897+ * separate unique ID for each backup instead of forcing another
8898+ * checkpoint, but taking a checkpoint right after another is
8899+ * not that expensive either, because only a few buffers will have
8900+ * been dirtied by then.
8901+ */
8902+ LWLockAcquire (WALInsertLock ,LW_SHARED );
8903+ if (XLByteLT (XLogCtl -> Insert .lastBackupStart ,startpoint ))
8904+ {
8905+ XLogCtl -> Insert .lastBackupStart = startpoint ;
8906+ gotUniqueStartpoint = true;
8907+ }
8908+ LWLockRelease (WALInsertLock );
8909+ }while (!gotUniqueStartpoint );
88838910
88848911XLByteToSeg (startpoint ,_logId ,_logSeg );
88858912XLogFileName (xlogfilename ,ThisTimeLineID ,_logId ,_logSeg );