77 * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
88 * Portions Copyright (c) 1994, Regents of the University of California
99 *
10- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.235 2006/04/14 20:27:24 tgl Exp $
10+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.236 2006/04/17 18:55:05 tgl Exp $
1111 *
1212 *-------------------------------------------------------------------------
1313 */
@@ -344,6 +344,7 @@ typedef struct XLogCtlInsert
344344XLogPageHeader currpage ;/* points to header of block in cache */
345345char * currpos ;/* current insertion point in cache */
346346XLogRecPtr RedoRecPtr ;/* current redo point for insertions */
347+ bool forcePageWrites ;/* forcing full-page writes for PITR? */
347348}XLogCtlInsert ;
348349
349350/*
@@ -466,7 +467,7 @@ static void exitArchiveRecovery(TimeLineID endTLI,
466467uint32 endLogId ,uint32 endLogSeg );
467468static bool recoveryStopsHere (XLogRecord * record ,bool * includeThis );
468469
469- static bool XLogCheckBuffer (XLogRecData * rdata ,
470+ static bool XLogCheckBuffer (XLogRecData * rdata ,bool doPageWrites ,
470471XLogRecPtr * lsn ,BkpBlock * bkpb );
471472static bool AdvanceXLInsertBuffer (void );
472473static void XLogWrite (XLogwrtRqst WriteRqst ,bool flexible );
@@ -544,6 +545,7 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
544545unsigned i ;
545546XLogwrtRqst LogwrtRqst ;
546547bool updrqst ;
548+ bool doPageWrites ;
547549bool no_tran = (rmid == RM_XLOG_ID ) ? true : false;
548550
549551if (info & XLR_INFO_MASK )
@@ -591,6 +593,14 @@ begin:;
591593dtbuf_bkp [i ]= false;
592594}
593595
596+ /*
597+ * Decide if we need to do full-page writes in this XLOG record: true if
598+ * full_page_writes is on or we have a PITR request for it. Since we
599+ * don't yet have the insert lock, forcePageWrites could change under us,
600+ * but we'll recheck it once we have the lock.
601+ */
602+ doPageWrites = fullPageWrites || Insert -> forcePageWrites ;
603+
594604INIT_CRC32 (rdata_crc );
595605len = 0 ;
596606for (rdt = rdata ;;)
@@ -622,7 +632,8 @@ begin:;
622632{
623633/* OK, put it in this slot */
624634dtbuf [i ]= rdt -> buffer ;
625- if (XLogCheckBuffer (rdt ,& (dtbuf_lsn [i ]),& (dtbuf_xlg [i ])))
635+ if (XLogCheckBuffer (rdt ,doPageWrites ,
636+ & (dtbuf_lsn [i ]),& (dtbuf_xlg [i ])))
626637{
627638dtbuf_bkp [i ]= true;
628639rdt -> data = NULL ;
@@ -735,30 +746,51 @@ begin:;
735746 * Check to see if my RedoRecPtr is out of date. If so, may have to go
736747 * back and recompute everything. This can only happen just after a
737748 * checkpoint, so it's better to be slow in this case and fast otherwise.
749+ *
750+ * If we aren't doing full-page writes then RedoRecPtr doesn't actually
751+ * affect the contents of the XLOG record, so we'll update our local
752+ * copy but not force a recomputation.
738753 */
739754if (!XLByteEQ (RedoRecPtr ,Insert -> RedoRecPtr ))
740755{
741756Assert (XLByteLT (RedoRecPtr ,Insert -> RedoRecPtr ));
742757RedoRecPtr = Insert -> RedoRecPtr ;
743758
744- for ( i = 0 ; i < XLR_MAX_BKP_BLOCKS ; i ++ )
759+ if ( doPageWrites )
745760{
746- if (dtbuf [i ]== InvalidBuffer )
747- continue ;
748- if (dtbuf_bkp [i ]== false&&
749- XLByteLE (dtbuf_lsn [i ],RedoRecPtr ))
761+ for (i = 0 ;i < XLR_MAX_BKP_BLOCKS ;i ++ )
750762{
751- /*
752- * Oops, this buffer now needs to be backed up, but we didn't
753 * think so above. Start over.
754- */
755- LWLockRelease (WALInsertLock );
756- END_CRIT_SECTION ();
757- gotobegin ;
763+ if (dtbuf [i ]== InvalidBuffer )
764+ continue ;
765+ if (dtbuf_bkp [i ]== false&&
766+ XLByteLE (dtbuf_lsn [i ],RedoRecPtr ))
767+ {
768+ /*
769+ * Oops, this buffer now needs to be backed up, but we
770+ * didn't think so above. Start over.
771+ */
772+ LWLockRelease (WALInsertLock );
773+ END_CRIT_SECTION ();
774+ gotobegin ;
775+ }
758776}
759777}
760778}
761779
780+ /*
781+ * Also check to see if forcePageWrites was just turned on; if we
782+ * weren't already doing full-page writes then go back and recompute.
783+ * (If it was just turned off, we could recompute the record without
784+ * full pages, but we choose not to bother.)
785+ */
786+ if (Insert -> forcePageWrites && !doPageWrites )
787+ {
788+ /* Oops, must redo it with full-page data */
789+ LWLockRelease (WALInsertLock );
790+ END_CRIT_SECTION ();
791+ gotobegin ;
792+ }
793+
762794/*
763795 * Make additional rdata chain entries for the backup blocks, so that we
764796 * don't need to special-case them in the write loop. Note that we have
@@ -966,7 +998,7 @@ begin:;
966998 * save the buffer's LSN at *lsn.
967999 */
9681000static bool
969- XLogCheckBuffer (XLogRecData * rdata ,
1001+ XLogCheckBuffer (XLogRecData * rdata ,bool doPageWrites ,
9701002XLogRecPtr * lsn ,BkpBlock * bkpb )
9711003{
9721004PageHeader page ;
@@ -980,7 +1012,7 @@ XLogCheckBuffer(XLogRecData *rdata,
9801012 */
9811013* lsn = page -> pd_lsn ;
9821014
983- if (fullPageWrites &&
1015+ if (doPageWrites &&
9841016XLByteLE (page -> pd_lsn ,RedoRecPtr ))
9851017{
9861018/*
@@ -5651,76 +5683,120 @@ pg_start_backup(PG_FUNCTION_ARGS)
56515683PointerGetDatum (backupid )));
56525684
56535685/*
5654- * Force a CHECKPOINT. This is not strictly necessary, but it seems like
5655- * a good idea to minimize the amount of past WAL needed to use the
5656- * backup. Also, this guarantees that two successive backup runs will
5657- * have different checkpoint positions and hence different history file
5658- * names, even if nothing happened in between.
5686+ * Mark backup active in shared memory. We must do full-page WAL writes
5687+ * during an on-line backup even if not doing so at other times, because
5688+ * it's quite possible for the backup dump to obtain a "torn" (partially
5689+ * written) copy of a database page if it reads the page concurrently
5690+ * with our write to the same page. This can be fixed as long as the
5691+ * first write to the page in the WAL sequence is a full-page write.
5692+ * Hence, we turn on forcePageWrites and then force a CHECKPOINT, to
5693+ * ensure there are no dirty pages in shared memory that might get
5694+ * dumped while the backup is in progress without having a corresponding
5695+ * WAL record. (Once the backup is complete, we need not force full-page
5696+ * writes anymore, since we expect that any pages not modified during
5697+ * the backup interval must have been correctly captured by the backup.)
5698+ *
5699+ * We must hold WALInsertLock to change the value of forcePageWrites,
5700+ * to ensure adequate interlocking against XLogInsert().
56595701 */
5660- RequestCheckpoint (true, false);
5702+ LWLockAcquire (WALInsertLock ,LW_EXCLUSIVE );
5703+ if (XLogCtl -> Insert .forcePageWrites )
5704+ {
5705+ LWLockRelease (WALInsertLock );
5706+ ereport (ERROR ,
5707+ (errcode (ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE ),
5708+ errmsg ("a backup is already in progress" ),
5709+ errhint ("Run pg_stop_backup() and try again." )));
5710+ }
5711+ XLogCtl -> Insert .forcePageWrites = true;
5712+ LWLockRelease (WALInsertLock );
56615713
5662- /*
5663- * Now we need to fetch the checkpoint record location, and also its REDO
5664- * pointer. The oldest point in WAL that would be needed to restore
5665- * starting from the checkpoint is precisely the REDO pointer.
5666- */
5667- LWLockAcquire (ControlFileLock ,LW_EXCLUSIVE );
5668- checkpointloc = ControlFile -> checkPoint ;
5669- startpoint = ControlFile -> checkPointCopy .redo ;
5670- LWLockRelease (ControlFileLock );
5714+ /* Use a TRY block to ensure we release forcePageWrites if we fail below */
5715+ PG_TRY ();
5716+ {
5717+ /*
5718+ * Force a CHECKPOINT. Aside from being necessary to prevent torn
5719+ * page problems, this guarantees that two successive backup runs will
5720+ * have different checkpoint positions and hence different history
5721+ * file names, even if nothing happened in between.
5722+ */
5723+ RequestCheckpoint (true, false);
56715724
5672- XLByteToSeg (startpoint ,_logId ,_logSeg );
5673- XLogFileName (xlogfilename ,ThisTimeLineID ,_logId ,_logSeg );
5725+ /*
5726+ * Now we need to fetch the checkpoint record location, and also its
5727+ * REDO pointer. The oldest point in WAL that would be needed to
5728+ * restore starting from the checkpoint is precisely the REDO pointer.
5729+ */
5730+ LWLockAcquire (ControlFileLock ,LW_EXCLUSIVE );
5731+ checkpointloc = ControlFile -> checkPoint ;
5732+ startpoint = ControlFile -> checkPointCopy .redo ;
5733+ LWLockRelease (ControlFileLock );
56745734
5675- /*
5676- * We deliberately use strftime/localtime not the src/timezone functions,
5677- * so that backup labels will consistently be recorded in the same
5678- * timezone regardless of TimeZone setting. This matches elog.c's
5679- * practice.
5680- */
5681- stamp_time = time (NULL );
5682- strftime (strfbuf ,sizeof (strfbuf ),
5683- "%Y-%m-%d %H:%M:%S %Z" ,
5684- localtime (& stamp_time ));
5735+ XLByteToSeg (startpoint ,_logId ,_logSeg );
5736+ XLogFileName (xlogfilename ,ThisTimeLineID ,_logId ,_logSeg );
56855737
5686- /*
5687- * Check for existing backup label --- implies a backup is already running
5688- */
5689- if (stat (BACKUP_LABEL_FILE ,& stat_buf )!= 0 )
5690- {
5691- if (errno != ENOENT )
5738+ /*
5739+ * We deliberately use strftime/localtime not the src/timezone
5740+ * functions, so that backup labels will consistently be recorded in
5741+ * the same timezone regardless of TimeZone setting. This matches
5742+ * elog.c's practice.
5743+ */
5744+ stamp_time = time (NULL );
5745+ strftime (strfbuf ,sizeof (strfbuf ),
5746+ "%Y-%m-%d %H:%M:%S %Z" ,
5747+ localtime (& stamp_time ));
5748+
5749+ /*
5750+ * Check for existing backup label --- implies a backup is already
5751+ * running. (XXX given that we checked forcePageWrites above, maybe
5752+ * it would be OK to just unlink any such label file?)
5753+ */
5754+ if (stat (BACKUP_LABEL_FILE ,& stat_buf )!= 0 )
5755+ {
5756+ if (errno != ENOENT )
5757+ ereport (ERROR ,
5758+ (errcode_for_file_access (),
5759+ errmsg ("could not stat file \"%s\": %m" ,
5760+ BACKUP_LABEL_FILE )));
5761+ }
5762+ else
5763+ ereport (ERROR ,
5764+ (errcode (ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE ),
5765+ errmsg ("a backup is already in progress" ),
5766+ errhint ("If you're sure there is no backup in progress, remove file \"%s\" and try again." ,
5767+ BACKUP_LABEL_FILE )));
5768+
5769+ /*
5770+ * Okay, write the file
5771+ */
5772+ fp = AllocateFile (BACKUP_LABEL_FILE ,"w" );
5773+ if (!fp )
56925774ereport (ERROR ,
56935775(errcode_for_file_access (),
5694- errmsg ("could not stat file \"%s\": %m" ,
5776+ errmsg ("could not create file \"%s\": %m" ,
5777+ BACKUP_LABEL_FILE )));
5778+ fprintf (fp ,"START WAL LOCATION: %X/%X (file %s)\n" ,
5779+ startpoint .xlogid ,startpoint .xrecoff ,xlogfilename );
5780+ fprintf (fp ,"CHECKPOINT LOCATION: %X/%X\n" ,
5781+ checkpointloc .xlogid ,checkpointloc .xrecoff );
5782+ fprintf (fp ,"START TIME: %s\n" ,strfbuf );
5783+ fprintf (fp ,"LABEL: %s\n" ,backupidstr );
5784+ if (fflush (fp )|| ferror (fp )|| FreeFile (fp ))
5785+ ereport (ERROR ,
5786+ (errcode_for_file_access (),
5787+ errmsg ("could not write file \"%s\": %m" ,
56955788BACKUP_LABEL_FILE )));
56965789}
5697- else
5698- ereport ( ERROR ,
5699- ( errcode ( ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE ),
5700- errmsg ( "a backup is already in progress" ),
5701- errhint ( "If you're sure there is no backup in progress, remove file \"%s\" and try again." ,
5702- BACKUP_LABEL_FILE )) );
5790+ PG_CATCH ();
5791+ {
5792+ /* Turn off forcePageWrites on failure */
5793+ LWLockAcquire ( WALInsertLock , LW_EXCLUSIVE );
5794+ XLogCtl -> Insert . forcePageWrites = false;
5795+ LWLockRelease ( WALInsertLock );
57035796
5704- /*
5705- * Okay, write the file
5706- */
5707- fp = AllocateFile (BACKUP_LABEL_FILE ,"w" );
5708- if (!fp )
5709- ereport (ERROR ,
5710- (errcode_for_file_access (),
5711- errmsg ("could not create file \"%s\": %m" ,
5712- BACKUP_LABEL_FILE )));
5713- fprintf (fp ,"START WAL LOCATION: %X/%X (file %s)\n" ,
5714- startpoint .xlogid ,startpoint .xrecoff ,xlogfilename );
5715- fprintf (fp ,"CHECKPOINT LOCATION: %X/%X\n" ,
5716- checkpointloc .xlogid ,checkpointloc .xrecoff );
5717- fprintf (fp ,"START TIME: %s\n" ,strfbuf );
5718- fprintf (fp ,"LABEL: %s\n" ,backupidstr );
5719- if (fflush (fp )|| ferror (fp )|| FreeFile (fp ))
5720- ereport (ERROR ,
5721- (errcode_for_file_access (),
5722- errmsg ("could not write file \"%s\": %m" ,
5723- BACKUP_LABEL_FILE )));
5797+ PG_RE_THROW ();
5798+ }
5799+ PG_END_TRY ();
57245800
57255801/*
57265802 * We're done. As a convenience, return the starting WAL offset.
@@ -5766,10 +5842,12 @@ pg_stop_backup(PG_FUNCTION_ARGS)
57665842
57675843/*
57685844 * Get the current end-of-WAL position; it will be unsafe to use this dump
5769- * to restore to a point in advance of this time.
5845+ * to restore to a point in advance of this time. We can also clear
5846+ * forcePageWrites here.
57705847 */
57715848LWLockAcquire (WALInsertLock ,LW_EXCLUSIVE );
57725849INSERT_RECPTR (stoppoint ,Insert ,Insert -> curridx );
5850+ XLogCtl -> Insert .forcePageWrites = false;
57735851LWLockRelease (WALInsertLock );
57745852
57755853XLByteToSeg (stoppoint ,_logId ,_logSeg );