Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit b0daba5

Browse files
committed
Tolerate timeline switches while "pg_basebackup -X fetch" is running.
If you take a base backup from a standby server with "pg_basebackup -X fetch", and the timeline switches while the backup is being taken, the backup used to fail with an error "requested WAL segment %s has already been removed". This is because the server-side code that sends over the required WAL files would not construct the WAL filename with the correct timeline after a switch. Fix that by using readdir() to scan pg_xlog for all the WAL segments in the range, regardless of timeline. Also, include all timeline history files in the backup, if taken with "-X fetch". That fixes another related bug: If a timeline switch happened just before the backup was initiated in a standby, the WAL segment containing the initial checkpoint record contains WAL from the older timeline too. Recovery will not accept that without a timeline history file that lists the older timeline. Backpatch to 9.2. Versions prior to that were not affected as you could not take a base backup from a standby before 9.2.
1 parent ee99427 commit b0daba5

File tree

4 files changed

+211
-53
lines changed

4 files changed

+211
-53
lines changed

‎src/backend/access/transam/xlog.c

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2797,18 +2797,33 @@ PreallocXlogFiles(XLogRecPtr endptr)
27972797
}
27982798

27992799
/*
2800-
* Get the segno of the latest removed or recycled WAL segment.
2801-
* Returns 0/0 if no WAL segments have been removed since startup.
2800+
* Throws an error if the given log segment has already been removed or
2801+
* recycled. The caller should only pass a segment that it knows to have
2802+
* existed while the server has been running, as this function always
2803+
* succeeds if no WAL segments have been removed since startup.
2804+
* 'tli' is only used in the error message.
28022805
*/
28032806
void
2804-
XLogGetLastRemoved(XLogSegNo*segno)
2807+
CheckXLogRemoved(XLogSegNosegno,TimeLineIDtli)
28052808
{
28062809
/* use volatile pointer to prevent code rearrangement */
28072810
volatileXLogCtlData*xlogctl=XLogCtl;
2811+
XLogSegNolastRemovedSegNo;
28082812

28092813
SpinLockAcquire(&xlogctl->info_lck);
2810-
*segno=xlogctl->lastRemovedSegNo;
2814+
lastRemovedSegNo=xlogctl->lastRemovedSegNo;
28112815
SpinLockRelease(&xlogctl->info_lck);
2816+
2817+
if (segno <=lastRemovedSegNo)
2818+
{
2819+
charfilename[MAXFNAMELEN];
2820+
2821+
XLogFileName(filename,tli,segno);
2822+
ereport(ERROR,
2823+
(errcode_for_file_access(),
2824+
errmsg("requested WAL segment %s has already been removed",
2825+
filename)));
2826+
}
28122827
}
28132828

28142829
/*

‎src/backend/replication/basebackup.c

Lines changed: 190 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -56,14 +56,13 @@ static void base_backup_cleanup(int code, Datum arg);
5656
staticvoidperform_base_backup(basebackup_options*opt,DIR*tblspcdir);
5757
staticvoidparse_basebackup_options(List*options,basebackup_options*opt);
5858
staticvoidSendXlogRecPtrResult(XLogRecPtrptr);
59+
staticintcompareWalFileNames(constvoid*a,constvoid*b);
5960

6061
/* Was the backup currently in-progress initiated in recovery mode? */
6162
staticboolbackup_started_in_recovery= false;
6263

6364
/*
6465
* Size of each block sent into the tar stream for larger files.
65-
*
66-
* XLogSegSize *MUST* be evenly dividable by this
6766
*/
6867
#defineTAR_SEND_SIZE 32768
6968

@@ -227,64 +226,201 @@ perform_base_backup(basebackup_options *opt, DIR *tblspcdir)
227226
* We've left the last tar file "open", so we can now append the
228227
* required WAL files to it.
229228
*/
230-
XLogSegNologsegno;
231-
XLogSegNoendlogsegno;
229+
charpathbuf[MAXPGPATH];
230+
XLogSegNosegno;
231+
XLogSegNostartsegno;
232+
XLogSegNoendsegno;
232233
structstatstatbuf;
234+
List*historyFileList=NIL;
235+
List*walFileList=NIL;
236+
char**walFiles;
237+
intnWalFiles;
238+
charfirstoff[MAXFNAMELEN];
239+
charlastoff[MAXFNAMELEN];
240+
DIR*dir;
241+
structdirent*de;
242+
inti;
243+
ListCell*lc;
244+
TimeLineIDtli;
233245

234-
MemSet(&statbuf,0,sizeof(statbuf));
235-
statbuf.st_mode=S_IRUSR |S_IWUSR;
236-
#ifndefWIN32
237-
statbuf.st_uid=geteuid();
238-
statbuf.st_gid=getegid();
239-
#endif
240-
statbuf.st_size=XLogSegSize;
241-
statbuf.st_mtime=time(NULL);
246+
/*
247+
* I'd rather not worry about timelines here, so scan pg_xlog and
248+
* include all WAL files in the range between 'startptr' and 'endptr',
249+
* regardless of the timeline the file is stamped with. If there are
250+
* some spurious WAL files belonging to timelines that don't belong
251+
* in this server's history, they will be included too. Normally there
252+
* shouldn't be such files, but if there are, there's little harm in
253+
* including them.
254+
*/
255+
XLByteToSeg(startptr,startsegno);
256+
XLogFileName(firstoff,ThisTimeLineID,startsegno);
257+
XLByteToPrevSeg(endptr,endsegno);
258+
XLogFileName(lastoff,ThisTimeLineID,endsegno);
259+
260+
dir=AllocateDir("pg_xlog");
261+
if (!dir)
262+
ereport(ERROR,
263+
(errmsg("could not open directory \"%s\": %m","pg_xlog")));
264+
while ((de=ReadDir(dir,"pg_xlog"))!=NULL)
265+
{
266+
/* Does it look like a WAL segment, and is it in the range? */
267+
if (strlen(de->d_name)==24&&
268+
strspn(de->d_name,"0123456789ABCDEF")==24&&
269+
strcmp(de->d_name+8,firstoff+8) >=0&&
270+
strcmp(de->d_name+8,lastoff+8) <=0)
271+
{
272+
walFileList=lappend(walFileList,pstrdup(de->d_name));
273+
}
274+
/* Does it look like a timeline history file? */
275+
elseif (strlen(de->d_name)==8+strlen(".history")&&
276+
strspn(de->d_name,"0123456789ABCDEF")==8&&
277+
strcmp(de->d_name+8,".history")==0)
278+
{
279+
historyFileList=lappend(historyFileList,pstrdup(de->d_name));
280+
}
281+
}
282+
FreeDir(dir);
242283

243-
XLByteToSeg(startptr,logsegno);
244-
XLByteToPrevSeg(endptr,endlogsegno);
284+
/*
285+
* Before we go any further, check that none of the WAL segments we
286+
* need were removed.
287+
*/
288+
CheckXLogRemoved(startsegno,ThisTimeLineID);
245289

246-
while (true)
290+
/*
291+
* Put the WAL filenames into an array, and sort. We send the files
292+
* in order from oldest to newest, to reduce the chance that a file
293+
* is recycled before we get a chance to send it over.
294+
*/
295+
nWalFiles=list_length(walFileList);
296+
walFiles=palloc(nWalFiles*sizeof(char*));
297+
i=0;
298+
foreach(lc,walFileList)
247299
{
248-
/* Send another xlog segment */
249-
charfn[MAXPGPATH];
250-
inti;
300+
walFiles[i++]=lfirst(lc);
301+
}
302+
qsort(walFiles,nWalFiles,sizeof(char*),compareWalFileNames);
251303

252-
XLogFilePath(fn,ThisTimeLineID,logsegno);
253-
_tarWriteHeader(fn,NULL,&statbuf);
304+
/*
305+
* Sanity check: the first and last segment should cover startptr and
306+
* endptr, with no gaps in between.
307+
*/
308+
XLogFromFileName(walFiles[0],&tli,&segno);
309+
if (segno!=startsegno)
310+
{
311+
charstartfname[MAXFNAMELEN];
312+
XLogFileName(startfname,ThisTimeLineID,startsegno);
313+
ereport(ERROR,
314+
(errmsg("could not find WAL file %s",startfname)));
315+
}
316+
for (i=0;i<nWalFiles;i++)
317+
{
318+
XLogSegNocurrsegno=segno;
319+
XLogSegNonextsegno=segno+1;
254320

255-
/* Send the actual WAL file contents, block-by-block */
256-
for (i=0;i<XLogSegSize /TAR_SEND_SIZE;i++)
321+
XLogFromFileName(walFiles[i],&tli,&segno);
322+
if (!(nextsegno==segno||currsegno==segno))
257323
{
258-
charbuf[TAR_SEND_SIZE];
259-
XLogRecPtrptr;
324+
charnextfname[MAXFNAMELEN];
325+
XLogFileName(nextfname,ThisTimeLineID,nextsegno);
326+
ereport(ERROR,
327+
(errmsg("could not find WAL file %s",nextfname)));
328+
}
329+
}
330+
if (segno!=endsegno)
331+
{
332+
charendfname[MAXFNAMELEN];
333+
XLogFileName(endfname,ThisTimeLineID,endsegno);
334+
ereport(ERROR,
335+
(errmsg("could not find WAL file %s",endfname)));
336+
}
337+
338+
/* Ok, we have everything we need. Send the WAL files. */
339+
for (i=0;i<nWalFiles;i++)
340+
{
341+
FILE*fp;
342+
charbuf[TAR_SEND_SIZE];
343+
size_tcnt;
344+
pgoff_tlen=0;
260345

261-
XLogSegNoOffsetToRecPtr(logsegno,TAR_SEND_SIZE*i,ptr);
346+
snprintf(pathbuf,MAXPGPATH,XLOGDIR"/%s",walFiles[i]);
347+
XLogFromFileName(walFiles[i],&tli,&segno);
262348

349+
fp=AllocateFile(pathbuf,"rb");
350+
if (fp==NULL)
351+
{
263352
/*
264-
* Some old compilers, e.g. gcc 2.95.3/x86, think that passing
265-
* a struct in the same function as a longjump might clobber a
266-
* variable. bjm 2011-02-04
267-
* http://lists.apple.com/archives/xcode-users/2003/Dec//msg000
268-
* 51.html
353+
* Most likely reason for this is that the file was already
354+
* removed by a checkpoint, so check for that to get a better
355+
* error message.
269356
*/
270-
XLogRead(buf,ThisTimeLineID,ptr,TAR_SEND_SIZE);
271-
if (pq_putmessage('d',buf,TAR_SEND_SIZE))
357+
CheckXLogRemoved(segno,tli);
358+
359+
ereport(ERROR,
360+
(errcode_for_file_access(),
361+
errmsg("could not open file \"%s\": %m",pathbuf)));
362+
}
363+
364+
if (fstat(fileno(fp),&statbuf)!=0)
365+
ereport(ERROR,
366+
(errcode_for_file_access(),
367+
errmsg("could not stat file \"%s\": %m",
368+
pathbuf)));
369+
if (statbuf.st_size!=XLogSegSize)
370+
{
371+
CheckXLogRemoved(segno,tli);
372+
ereport(ERROR,
373+
(errcode_for_file_access(),
374+
errmsg("unexpected WAL file size \"%s\"",walFiles[i])));
375+
}
376+
377+
_tarWriteHeader(pathbuf,NULL,&statbuf);
378+
379+
while ((cnt=fread(buf,1,Min(sizeof(buf),XLogSegSize-len),fp))>0)
380+
{
381+
CheckXLogRemoved(segno,tli);
382+
/* Send the chunk as a CopyData message */
383+
if (pq_putmessage('d',buf,cnt))
272384
ereport(ERROR,
273385
(errmsg("base backup could not send data, aborting backup")));
386+
387+
len+=cnt;
388+
if (len==XLogSegSize)
389+
break;
274390
}
275391

276-
/*
277-
* Files are always fixed size, and always end on a 512 byte
278-
* boundary, so padding is never necessary.
279-
*/
392+
if (len!=XLogSegSize)
393+
{
394+
CheckXLogRemoved(segno,tli);
395+
ereport(ERROR,
396+
(errcode_for_file_access(),
397+
errmsg("unexpected WAL file size \"%s\"",walFiles[i])));
398+
}
280399

400+
/* XLogSegSize is a multiple of 512, so no need for padding */
401+
FreeFile(fp);
402+
}
281403

282-
/* Advance to the next WAL file */
283-
logsegno++;
404+
/*
405+
* Send timeline history files too. Only the latest timeline history
406+
* file is required for recovery, and even that only if there happens
407+
* to be a timeline switch in the first WAL segment that contains the
408+
* checkpoint record, or if we're taking a base backup from a standby
409+
* server and the target timeline changes while the backup is taken.
410+
* But they are small and highly useful for debugging purposes, so
411+
* better include them all, always.
412+
*/
413+
foreach(lc,historyFileList)
414+
{
415+
char*fname=lfirst(lc);
416+
snprintf(pathbuf,MAXPGPATH,XLOGDIR"/%s",fname);
284417

285-
/* Have we reached our stop position yet? */
286-
if (logsegno>endlogsegno)
287-
break;
418+
if (lstat(pathbuf,&statbuf)!=0)
419+
ereport(ERROR,
420+
(errcode_for_file_access(),
421+
errmsg("could not stat file \"%s\": %m",pathbuf)));
422+
423+
sendFile(pathbuf,pathbuf,&statbuf, false);
288424
}
289425

290426
/* Send CopyDone message for the last tar file */
@@ -293,6 +429,19 @@ perform_base_backup(basebackup_options *opt, DIR *tblspcdir)
293429
SendXlogRecPtrResult(endptr);
294430
}
295431

432+
/*
433+
* qsort comparison function, to compare log/seg portion of WAL segment
434+
* filenames, ignoring the timeline portion.
435+
*/
436+
staticint
437+
compareWalFileNames(constvoid*a,constvoid*b)
438+
{
439+
char*fna=*((char**)a);
440+
char*fnb=*((char**)b);
441+
442+
returnstrcmp(fna+8,fnb+8);
443+
}
444+
296445
/*
297446
* Parse the base backup options passed down by the parser
298447
*/

‎src/backend/replication/walsender.c

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1170,7 +1170,6 @@ XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count)
11701170
char*p;
11711171
XLogRecPtrrecptr;
11721172
Sizenbytes;
1173-
XLogSegNolastRemovedSegNo;
11741173
XLogSegNosegno;
11751174

11761175
retry:
@@ -1263,13 +1262,8 @@ XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count)
12631262
* read() succeeds in that case, but the data we tried to read might
12641263
* already have been overwritten with new WAL records.
12651264
*/
1266-
XLogGetLastRemoved(&lastRemovedSegNo);
12671265
XLByteToSeg(startptr,segno);
1268-
if (segno <=lastRemovedSegNo)
1269-
ereport(ERROR,
1270-
(errcode_for_file_access(),
1271-
errmsg("requested WAL segment %s has already been removed",
1272-
XLogFileNameP(sendTimeLine,segno))));
1266+
CheckXLogRemoved(segno,ThisTimeLineID);
12731267

12741268
/*
12751269
* During recovery, the currently-open WAL file might be replaced with the

‎src/include/access/xlog.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,7 @@ extern bool XLogNeedsFlush(XLogRecPtr RecPtr);
267267
externintXLogFileInit(XLogSegNosegno,bool*use_existent,booluse_lock);
268268
externintXLogFileOpen(XLogSegNosegno);
269269

270-
externvoidXLogGetLastRemoved(XLogSegNo*segno);
270+
externvoidCheckXLogRemoved(XLogSegNosegno,TimeLineIDtli);
271271
externvoidXLogSetAsyncXactLSN(XLogRecPtrrecord);
272272

273273
externBufferRestoreBackupBlock(XLogRecPtrlsn,XLogRecord*record,

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp