Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitb4c99c9

Browse files
committed
Tolerate timeline switches while "pg_basebackup -X fetch" is running.
If you take a base backup from a standby server with "pg_basebackup -X fetch", and the timeline switches while the backup is being taken, the backup used to fail with an error "requested WAL segment %s has already been removed". This is because the server-side code that sends over the required WAL files would not construct the WAL filename with the correct timeline after a switch. Fix that by using readdir() to scan pg_xlog for all the WAL segments in the range, regardless of timeline. Also, include all timeline history files in the backup, if taken with "-X fetch". That fixes another related bug: If a timeline switch happened just before the backup was initiated in a standby, the WAL segment containing the initial checkpoint record contains WAL from the older timeline too. Recovery will not accept that without a timeline history file that lists the older timeline. Backpatch to 9.2. Versions prior to that were not affected as you could not take a base backup from a standby before 9.2.
1 parentfaf1b1b commitb4c99c9

File tree

4 files changed

+217
-61
lines changed

4 files changed

+217
-61
lines changed

‎src/backend/access/transam/xlog.c

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3456,19 +3456,36 @@ PreallocXlogFiles(XLogRecPtr endptr)
34563456
}
34573457

34583458
/*
3459-
* Get the log/seg of the latest removed or recycled WAL segment.
3460-
* Returns 0/0 if no WAL segments have been removed since startup.
3459+
* Throws an error if the given log segment has already been removed or
3460+
* recycled. The caller should only pass a segment that it knows to have
3461+
* existed while the server has been running, as this function always
3462+
* succeeds if no WAL segments have been removed since startup.
3463+
* 'tli' is only used in the error message.
34613464
*/
34623465
void
3463-
XLogGetLastRemoved(uint32*log,uint32*seg)
3466+
CheckXLogRemoved(uint32log,uint32seg,TimeLineIDtli)
34643467
{
34653468
/* use volatile pointer to prevent code rearrangement */
34663469
volatileXLogCtlData*xlogctl=XLogCtl;
3470+
uint32lastRemovedLog,
3471+
lastRemovedSeg;
34673472

34683473
SpinLockAcquire(&xlogctl->info_lck);
3469-
*log=xlogctl->lastRemovedLog;
3470-
*seg=xlogctl->lastRemovedSeg;
3474+
lastRemovedLog=xlogctl->lastRemovedLog;
3475+
lastRemovedSeg=xlogctl->lastRemovedSeg;
34713476
SpinLockRelease(&xlogctl->info_lck);
3477+
3478+
if (log<lastRemovedLog||
3479+
(log==lastRemovedLog&&seg <=lastRemovedSeg))
3480+
{
3481+
charfilename[MAXFNAMELEN];
3482+
3483+
XLogFileName(filename,tli,log,seg);
3484+
ereport(ERROR,
3485+
(errcode_for_file_access(),
3486+
errmsg("requested WAL segment %s has already been removed",
3487+
filename)));
3488+
}
34723489
}
34733490

34743491
/*

‎src/backend/replication/basebackup.c

Lines changed: 193 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -55,11 +55,10 @@ static void base_backup_cleanup(int code, Datum arg);
5555
staticvoidperform_base_backup(basebackup_options*opt,DIR*tblspcdir);
5656
staticvoidparse_basebackup_options(List*options,basebackup_options*opt);
5757
staticvoidSendXlogRecPtrResult(XLogRecPtrptr);
58+
staticintcompareWalFileNames(constvoid*a,constvoid*b);
5859

5960
/*
6061
* Size of each block sent into the tar stream for larger files.
61-
*
62-
* XLogSegSize *MUST* be evenly dividable by this
6362
*/
6463
#defineTAR_SEND_SIZE 32768
6564

@@ -221,68 +220,208 @@ perform_base_backup(basebackup_options *opt, DIR *tblspcdir)
221220
* We've left the last tar file "open", so we can now append the
222221
* required WAL files to it.
223222
*/
223+
charpathbuf[MAXPGPATH];
224224
uint32logid,
225225
logseg;
226+
uint32startlogid,
227+
startlogseg;
226228
uint32endlogid,
227229
endlogseg;
228230
structstatstatbuf;
231+
List*historyFileList=NIL;
232+
List*walFileList=NIL;
233+
char**walFiles;
234+
intnWalFiles;
235+
charfirstoff[MAXFNAMELEN];
236+
charlastoff[MAXFNAMELEN];
237+
DIR*dir;
238+
structdirent*de;
239+
inti;
240+
ListCell*lc;
241+
TimeLineIDtli;
229242

230-
MemSet(&statbuf,0,sizeof(statbuf));
231-
statbuf.st_mode=S_IRUSR |S_IWUSR;
232-
#ifndefWIN32
233-
statbuf.st_uid=geteuid();
234-
statbuf.st_gid=getegid();
235-
#endif
236-
statbuf.st_size=XLogSegSize;
237-
statbuf.st_mtime=time(NULL);
238-
239-
XLByteToSeg(startptr,logid,logseg);
243+
/*
244+
* I'd rather not worry about timelines here, so scan pg_xlog and
245+
* include all WAL files in the range between 'startptr' and 'endptr',
246+
* regardless of the timeline the file is stamped with. If there are
247+
* some spurious WAL files belonging to timelines that don't belong
248+
* in this server's history, they will be included too. Normally there
249+
* shouldn't be such files, but if there are, there's little harm in
250+
* including them.
251+
*/
252+
XLByteToSeg(startptr,startlogid,startlogseg);
253+
XLogFileName(firstoff,ThisTimeLineID,startlogid,startlogseg);
240254
XLByteToPrevSeg(endptr,endlogid,endlogseg);
255+
XLogFileName(lastoff,ThisTimeLineID,endlogid,endlogseg);
241256

242-
while (true)
257+
dir=AllocateDir("pg_xlog");
258+
if (!dir)
259+
ereport(ERROR,
260+
(errmsg("could not open directory \"%s\": %m","pg_xlog")));
261+
while ((de=ReadDir(dir,"pg_xlog"))!=NULL)
243262
{
244-
/* Send another xlog segment */
245-
charfn[MAXPGPATH];
246-
inti;
263+
/* Does it look like a WAL segment, and is it in the range? */
264+
if (strlen(de->d_name)==24&&
265+
strspn(de->d_name,"0123456789ABCDEF")==24&&
266+
strcmp(de->d_name+8,firstoff+8) >=0&&
267+
strcmp(de->d_name+8,lastoff+8) <=0)
268+
{
269+
walFileList=lappend(walFileList,pstrdup(de->d_name));
270+
}
271+
/* Does it look like a timeline history file? */
272+
elseif (strlen(de->d_name)==8+strlen(".history")&&
273+
strspn(de->d_name,"0123456789ABCDEF")==8&&
274+
strcmp(de->d_name+8,".history")==0)
275+
{
276+
historyFileList=lappend(historyFileList,pstrdup(de->d_name));
277+
}
278+
}
279+
FreeDir(dir);
247280

248-
XLogFilePath(fn,ThisTimeLineID,logid,logseg);
249-
_tarWriteHeader(fn,NULL,&statbuf);
281+
/*
282+
* Before we go any further, check that none of the WAL segments we
283+
* need were removed.
284+
*/
285+
CheckXLogRemoved(startlogid,startlogseg,ThisTimeLineID);
286+
287+
/*
288+
* Put the WAL filenames into an array, and sort. We send the files
289+
* in order from oldest to newest, to reduce the chance that a file
290+
* is recycled before we get a chance to send it over.
291+
*/
292+
nWalFiles=list_length(walFileList);
293+
walFiles=palloc(nWalFiles*sizeof(char*));
294+
i=0;
295+
foreach(lc,walFileList)
296+
{
297+
walFiles[i++]=lfirst(lc);
298+
}
299+
qsort(walFiles,nWalFiles,sizeof(char*),compareWalFileNames);
250300

251-
/* Send the actual WAL file contents, block-by-block */
252-
for (i=0;i<XLogSegSize /TAR_SEND_SIZE;i++)
301+
/*
302+
* Sanity check: the first and last segment should cover startptr and
303+
* endptr, with no gaps in between.
304+
*/
305+
XLogFromFileName(walFiles[0],&tli,&logid,&logseg);
306+
if (logid!=startlogid||logseg!=startlogseg)
307+
{
308+
charstartfname[MAXFNAMELEN];
309+
XLogFileName(startfname,ThisTimeLineID,startlogid,startlogseg);
310+
ereport(ERROR,
311+
(errmsg("could not find WAL file %s",startfname)));
312+
}
313+
for (i=0;i<nWalFiles;i++)
314+
{
315+
intcurrlogid=logid,
316+
currlogseg=logseg;
317+
intnextlogid=logid,
318+
nextlogseg=logseg;
319+
NextLogSeg(nextlogid,nextlogseg);
320+
321+
XLogFromFileName(walFiles[i],&tli,&logid,&logseg);
322+
if (!((nextlogid==logid&&nextlogseg==logseg)||
323+
(currlogid==logid&&currlogseg==logseg)))
253324
{
254-
charbuf[TAR_SEND_SIZE];
255-
XLogRecPtrptr;
325+
charnextfname[MAXFNAMELEN];
326+
XLogFileName(nextfname,ThisTimeLineID,nextlogid,nextlogseg);
327+
ereport(ERROR,
328+
(errmsg("could not find WAL file %s",nextfname)));
329+
}
330+
}
331+
if (logid!=endlogid||logseg!=endlogseg)
332+
{
333+
charendfname[MAXFNAMELEN];
334+
XLogFileName(endfname,ThisTimeLineID,endlogid,endlogseg);
335+
ereport(ERROR,
336+
(errmsg("could not find WAL file %s",endfname)));
337+
}
338+
339+
/* Ok, we have everything we need. Send the WAL files. */
340+
for (i=0;i<nWalFiles;i++)
341+
{
342+
FILE*fp;
343+
charbuf[TAR_SEND_SIZE];
344+
size_tcnt;
345+
pgoff_tlen=0;
256346

257-
ptr.xlogid=logid;
258-
ptr.xrecoff=logseg*XLogSegSize+TAR_SEND_SIZE*i;
347+
snprintf(pathbuf,MAXPGPATH,XLOGDIR"/%s",walFiles[i]);
348+
XLogFromFileName(walFiles[i],&tli,&logid,&logseg);
259349

350+
fp=AllocateFile(pathbuf,"rb");
351+
if (fp==NULL)
352+
{
260353
/*
261-
* Some old compilers, e.g. gcc 2.95.3/x86, think that passing
262-
* a struct in the same function as a longjump might clobber a
263-
* variable. bjm 2011-02-04
264-
* http://lists.apple.com/archives/xcode-users/2003/Dec//msg000
265-
* 51.html
354+
* Most likely reason for this is that the file was already
355+
* removed by a checkpoint, so check for that to get a better
356+
* error message.
266357
*/
267-
XLogRead(buf,ptr,TAR_SEND_SIZE);
268-
if (pq_putmessage('d',buf,TAR_SEND_SIZE))
358+
CheckXLogRemoved(logid,logseg,tli);
359+
360+
ereport(ERROR,
361+
(errcode_for_file_access(),
362+
errmsg("could not open file \"%s\": %m",pathbuf)));
363+
}
364+
365+
if (fstat(fileno(fp),&statbuf)!=0)
366+
ereport(ERROR,
367+
(errcode_for_file_access(),
368+
errmsg("could not stat file \"%s\": %m",
369+
pathbuf)));
370+
if (statbuf.st_size!=XLogSegSize)
371+
{
372+
CheckXLogRemoved(logid,logseg,tli);
373+
ereport(ERROR,
374+
(errcode_for_file_access(),
375+
errmsg("unexpected WAL file size \"%s\"",walFiles[i])));
376+
}
377+
378+
_tarWriteHeader(pathbuf,NULL,&statbuf);
379+
380+
while ((cnt=fread(buf,1,Min(sizeof(buf),XLogSegSize-len),fp))>0)
381+
{
382+
CheckXLogRemoved(logid,logseg,tli);
383+
/* Send the chunk as a CopyData message */
384+
if (pq_putmessage('d',buf,cnt))
269385
ereport(ERROR,
270386
(errmsg("base backup could not send data, aborting backup")));
387+
388+
len+=cnt;
389+
if (len==XLogSegSize)
390+
break;
271391
}
272392

273-
/*
274-
* Files are always fixed size, and always end on a 512 byte
275-
* boundary, so padding is never necessary.
276-
*/
393+
if (len!=XLogSegSize)
394+
{
395+
CheckXLogRemoved(logid,logseg,tli);
396+
ereport(ERROR,
397+
(errcode_for_file_access(),
398+
errmsg("unexpected WAL file size \"%s\"",walFiles[i])));
399+
}
277400

401+
/* XLogSegSize is a multiple of 512, so no need for padding */
402+
FreeFile(fp);
403+
}
404+
405+
/*
406+
* Send timeline history files too. Only the latest timeline history
407+
* file is required for recovery, and even that only if there happens
408+
* to be a timeline switch in the first WAL segment that contains the
409+
* checkpoint record, or if we're taking a base backup from a standby
410+
* server and the target timeline changes while the backup is taken.
411+
* But they are small and highly useful for debugging purposes, so
412+
* better include them all, always.
413+
*/
414+
foreach(lc,historyFileList)
415+
{
416+
char*fname=lfirst(lc);
417+
snprintf(pathbuf,MAXPGPATH,XLOGDIR"/%s",fname);
278418

279-
/* Advance to the next WAL file */
280-
NextLogSeg(logid,logseg);
419+
if (lstat(pathbuf,&statbuf)!=0)
420+
ereport(ERROR,
421+
(errcode_for_file_access(),
422+
errmsg("could not stat file \"%s\": %m",pathbuf)));
281423

282-
/* Have we reached our stop position yet? */
283-
if (logid>endlogid||
284-
(logid==endlogid&&logseg>endlogseg))
285-
break;
424+
sendFile(pathbuf,pathbuf,&statbuf, false);
286425
}
287426

288427
/* Send CopyDone message for the last tar file */
@@ -291,6 +430,19 @@ perform_base_backup(basebackup_options *opt, DIR *tblspcdir)
291430
SendXlogRecPtrResult(endptr);
292431
}
293432

433+
/*
434+
* qsort comparison function, to compare log/seg portion of WAL segment
435+
* filenames, ignoring the timeline portion.
436+
*/
437+
staticint
438+
compareWalFileNames(constvoid*a,constvoid*b)
439+
{
440+
char*fna=*((char**)a);
441+
char*fnb=*((char**)b);
442+
443+
returnstrcmp(fna+8,fnb+8);
444+
}
445+
294446
/*
295447
* Parse the base backup options passed down by the parser
296448
*/

‎src/backend/replication/walsender.c

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -977,8 +977,6 @@ XLogRead(char *buf, XLogRecPtr startptr, Size count)
977977
char*p;
978978
XLogRecPtrrecptr;
979979
Sizenbytes;
980-
uint32lastRemovedLog;
981-
uint32lastRemovedSeg;
982980
uint32log;
983981
uint32seg;
984982

@@ -1073,19 +1071,8 @@ XLogRead(char *buf, XLogRecPtr startptr, Size count)
10731071
* read() succeeds in that case, but the data we tried to read might
10741072
* already have been overwritten with new WAL records.
10751073
*/
1076-
XLogGetLastRemoved(&lastRemovedLog,&lastRemovedSeg);
10771074
XLByteToSeg(startptr,log,seg);
1078-
if (log<lastRemovedLog||
1079-
(log==lastRemovedLog&&seg <=lastRemovedSeg))
1080-
{
1081-
charfilename[MAXFNAMELEN];
1082-
1083-
XLogFileName(filename,ThisTimeLineID,log,seg);
1084-
ereport(ERROR,
1085-
(errcode_for_file_access(),
1086-
errmsg("requested WAL segment %s has already been removed",
1087-
filename)));
1088-
}
1075+
CheckXLogRemoved(log,seg,ThisTimeLineID);
10891076

10901077
/*
10911078
* During recovery, the currently-open WAL file might be replaced with the

‎src/include/access/xlog.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -275,7 +275,7 @@ extern int XLogFileInit(uint32 log, uint32 seg,
275275
externintXLogFileOpen(uint32log,uint32seg);
276276

277277

278-
externvoidXLogGetLastRemoved(uint32*log,uint32*seg);
278+
externvoidCheckXLogRemoved(uint32log,uint32seg,TimeLineIDtli);
279279
externvoidXLogSetAsyncXactLSN(XLogRecPtrrecord);
280280

281281
externBufferRestoreBackupBlock(XLogRecPtrlsn,XLogRecord*record,

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp