Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit d46fa5f

Browse files
committed
[PBCKP-98] fix invalid stop lsn. Reported by Alexander Lakhin and Alex Ignatov
1 parent 963f20f, commit d46fa5f

File tree

3 files changed

+171
-58
lines changed

3 files changed

+171
-58
lines changed

‎src/backup.c‎

Lines changed: 78 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -401,10 +401,10 @@ do_backup_instance(PGconn *backup_conn, PGNodeInfo *nodeInfo, bool no_sync)
401401

402402
if (current.backup_mode!=BACKUP_MODE_FULL)
403403
{
404-
elog(LOG,"current_tli:%X",current.tli);
405-
elog(LOG,"prev_backup->start_lsn: %X/%X",
404+
elog(LOG,"Current tli:%X",current.tli);
405+
elog(LOG,"Parentstart_lsn: %X/%X",
406406
(uint32) (prev_backup->start_lsn >>32), (uint32) (prev_backup->start_lsn));
407-
elog(LOG,"current.start_lsn: %X/%X",
407+
elog(LOG,"start_lsn: %X/%X",
408408
(uint32) (current.start_lsn >>32), (uint32) (current.start_lsn));
409409
}
410410

@@ -583,9 +583,6 @@ do_backup_instance(PGconn *backup_conn, PGNodeInfo *nodeInfo, bool no_sync)
583583
/* Notify end of backup */
584584
pg_stop_backup(&current,pg_startbackup_conn,nodeInfo);
585585

586-
elog(LOG,"current.stop_lsn: %X/%X",
587-
(uint32) (stop_backup_lsn >>32), (uint32) (stop_backup_lsn));
588-
589586
/* In case of backup from replica >= 9.6 we must fix minRecPoint,
590587
* First we must find pg_control in backup_files_list.
591588
*/
@@ -1742,65 +1739,66 @@ pg_stop_backup(pgBackup *backup, PGconn *pg_startbackup_conn,
17421739
/* Calculate LSN */
17431740
stop_backup_lsn_tmp= ((uint64)lsn_hi) <<32 |lsn_lo;
17441741

1742+
/* It is ok for replica to return invalid STOP LSN
1743+
* UPD: Apparently it is ok even for a master.
1744+
*/
17451745
if (!XRecOffIsValid(stop_backup_lsn_tmp))
17461746
{
1747-
/* It is ok for replica to return STOP LSN with NullXRecOff
1748-
* UPD: Apparently it is ok even for master.
1749-
*/
1750-
if (XRecOffIsNull(stop_backup_lsn_tmp))
1751-
{
1752-
char*xlog_path,
1753-
stream_xlog_path[MAXPGPATH];
1754-
XLogSegNosegno=0;
1755-
XLogRecPtrlsn_tmp=InvalidXLogRecPtr;
1747+
char*xlog_path,
1748+
stream_xlog_path[MAXPGPATH];
1749+
XLogSegNosegno=0;
1750+
XLogRecPtrlsn_tmp=InvalidXLogRecPtr;
17561751

1757-
/*
1758-
* Even though the value is invalid, it's expected postgres behaviour
1759-
* and we're trying to fix it below.
1760-
*/
1761-
elog(LOG,"Null offset instop_backup_lsn value %X/%X, trying to fix",
1762-
(uint32) (stop_backup_lsn_tmp >>32), (uint32) (stop_backup_lsn_tmp));
1752+
/*
1753+
* Even though the value is invalid, it's expected postgres behaviour
1754+
* and we're trying to fix it below.
1755+
*/
1756+
elog(LOG,"Invalid offset instop_lsn value %X/%X, trying to fix",
1757+
(uint32) (stop_backup_lsn_tmp >>32), (uint32) (stop_backup_lsn_tmp));
17631758

1764-
/*
1765-
* Note: even with gdb it is very hard to produce automated tests for
1766-
* contrecord +NullXRecOff, so emulate it for manual testing.
1767-
*/
1768-
//stop_backup_lsn_tmp = stop_backup_lsn_tmp - XLOG_SEG_SIZE;
1769-
//elog(WARNING, "New Invalid stop_backup_lsn value %X/%X",
1770-
// (uint32) (stop_backup_lsn_tmp >> 32), (uint32) (stop_backup_lsn_tmp));
1759+
/*
1760+
* Note: even with gdb it is very hard to produce automated tests for
1761+
* contrecord +invalid LSN, so emulate it for manual testing.
1762+
*/
1763+
//stop_backup_lsn_tmp = stop_backup_lsn_tmp - XLOG_SEG_SIZE;
1764+
//elog(WARNING, "New Invalid stop_backup_lsn value %X/%X",
1765+
// (uint32) (stop_backup_lsn_tmp >> 32), (uint32) (stop_backup_lsn_tmp));
17711766

1772-
if (stream_wal)
1773-
{
1774-
pgBackupGetPath2(backup,stream_xlog_path,
1775-
lengthof(stream_xlog_path),
1776-
DATABASE_DIR,PG_XLOG_DIR);
1777-
xlog_path=stream_xlog_path;
1778-
}
1779-
else
1780-
xlog_path=arclog_path;
1767+
if (stream_wal)
1768+
{
1769+
pgBackupGetPath2(backup,stream_xlog_path,
1770+
lengthof(stream_xlog_path),
1771+
DATABASE_DIR,PG_XLOG_DIR);
1772+
xlog_path=stream_xlog_path;
1773+
}
1774+
else
1775+
xlog_path=arclog_path;
17811776

1782-
GetXLogSegNo(stop_backup_lsn_tmp,segno,instance_config.xlog_seg_size);
1777+
GetXLogSegNo(stop_backup_lsn_tmp,segno,instance_config.xlog_seg_size);
17831778

1784-
/*
1785-
* Note, that there is no guarantee that corresponding WAL file even exists.
1786-
* Replica may return LSN from future and keep staying in present.
1787-
* Or it can returnLSN with NullXRecOff.
1788-
*
1789-
* That's bad, since we want to get real LSN to save it in backup label file
1790-
* and to use it in WAL validation.
1791-
*
1792-
* So we try to do the following:
1793-
* 1. Wait 'archive_timeout' seconds for segment containing stop_lsn and
1794-
* look for the first valid record in it.
1795-
* It solves the problem of occasional invalidXRecOff on write-busy system.
1796-
* 2. Failing that, look for record in previous segment with endpoint
1797-
* equal or greater than stop_lsn. It may(!) solve the problem ofNullXRecOff
1798-
* on write-idle system. If that fails too, error out.
1799-
*/
1779+
/*
1780+
* Note, that there is no guarantee that corresponding WAL file even exists.
1781+
* Replica may return LSN from future and keep staying in present.
1782+
* Or it can returninvalid LSN.
1783+
*
1784+
* That's bad, since we want to get real LSN to save it in backup label file
1785+
* and to use it in WAL validation.
1786+
*
1787+
* So we try to do the following:
1788+
* 1. Wait 'archive_timeout' seconds for segment containing stop_lsn and
1789+
* look for the first valid record in it.
1790+
* It solves the problem of occasional invalidLSN on write-busy system.
1791+
* 2. Failing that, look for record in previous segment with endpoint
1792+
* equal or greater than stop_lsn. It may(!) solve the problem ofinvalid LSN
1793+
* on write-idle system. If that fails too, error out.
1794+
*/
18001795

1796+
/* stop_lsn is pointing to a 0 byte of xlog segment */
1797+
if (stop_backup_lsn_tmp %instance_config.xlog_seg_size==0)
1798+
{
18011799
/* Wait for segment with current stop_lsn, it is ok for it to never arrive */
18021800
wait_wal_lsn(stop_backup_lsn_tmp, false,backup->tli,
1803-
false, true,WARNING,stream_wal);
1801+
false, true,WARNING,stream_wal);
18041802

18051803
/* Get the first record in segment with current stop_lsn */
18061804
lsn_tmp=get_first_record_lsn(xlog_path,segno,backup->tli,
@@ -1836,17 +1834,39 @@ pg_stop_backup(pgBackup *backup, PGconn *pg_startbackup_conn,
18361834
(uint32) (stop_backup_lsn_tmp >>32),
18371835
(uint32) (stop_backup_lsn_tmp));
18381836
}
1837+
}
1838+
/* stop lsn is aligned to xlog block size, just find next lsn */
1839+
elseif (stop_backup_lsn_tmp %XLOG_BLCKSZ==0)
1840+
{
1841+
/* Wait for segment with current stop_lsn */
1842+
wait_wal_lsn(stop_backup_lsn_tmp, false,backup->tli,
1843+
false, true,ERROR,stream_wal);
1844+
1845+
/* Get the next closest record in segment with current stop_lsn */
1846+
lsn_tmp=get_next_record_lsn(xlog_path,segno,backup->tli,
1847+
instance_config.xlog_seg_size,
1848+
instance_config.archive_timeout,
1849+
stop_backup_lsn_tmp);
18391850

1840-
/* Setting stop_backup_lsn will set stop point for streaming */
1841-
stop_backup_lsn=lsn_tmp;
1842-
stop_lsn_exists= true;
1851+
/* sanity */
1852+
if (!XRecOffIsValid(lsn_tmp)||XLogRecPtrIsInvalid(lsn_tmp))
1853+
elog(ERROR,"Failed to get WAL record next to %X/%X",
1854+
(uint32) (stop_backup_lsn_tmp >>32),
1855+
(uint32) (stop_backup_lsn_tmp));
18431856
}
18441857
/* PostgreSQL returned something very illegal as STOP_LSN, error out */
18451858
else
18461859
elog(ERROR,"Invalid stop_backup_lsn value %X/%X",
18471860
(uint32) (stop_backup_lsn_tmp >>32), (uint32) (stop_backup_lsn_tmp));
1861+
1862+
/* Setting stop_backup_lsn will set stop point for streaming */
1863+
stop_backup_lsn=lsn_tmp;
1864+
stop_lsn_exists= true;
18481865
}
18491866

1867+
elog(LOG,"stop_lsn: %X/%X",
1868+
(uint32) (stop_backup_lsn >>32), (uint32) (stop_backup_lsn));
1869+
18501870
/* Write backup_label and tablespace_map */
18511871
if (!exclusive_backup)
18521872
{

‎src/parsexlog.c‎

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -680,6 +680,97 @@ get_first_record_lsn(const char *archivedir, XLogSegNosegno,
680680
returnrecord;
681681
}
682682

683+
684+
/*
685+
* Get LSN of the record next after target lsn.
686+
*/
687+
XLogRecPtr
688+
get_next_record_lsn(constchar*archivedir,XLogSegNosegno,
689+
TimeLineIDtli,uint32wal_seg_size,inttimeout,
690+
XLogRecPtrtarget)
691+
{
692+
XLogReaderState*xlogreader;
693+
XLogReaderDatareader_data;
694+
XLogRecPtrstartpoint,found,res;
695+
charwal_segment[MAXFNAMELEN];
696+
intattempts=0;
697+
698+
if (segno <=1)
699+
elog(ERROR,"Invalid WAL segment number "UINT64_FORMAT,segno);
700+
701+
GetXLogFileName(wal_segment,tli,segno,instance_config.xlog_seg_size);
702+
703+
xlogreader=InitXLogPageRead(&reader_data,archivedir,tli,wal_seg_size,
704+
false, false, true);
705+
if (xlogreader==NULL)
706+
elog(ERROR,"Out of memory");
707+
xlogreader->system_identifier=instance_config.system_identifier;
708+
709+
/* Set startpoint to 0 in segno */
710+
GetXLogRecPtr(segno,0,wal_seg_size,startpoint);
711+
712+
found=XLogFindNextRecord(xlogreader,startpoint);
713+
714+
if (XLogRecPtrIsInvalid(found))
715+
{
716+
if (xlogreader->errormsg_buf[0]!='\0')
717+
elog(WARNING,"Could not read WAL record at %X/%X: %s",
718+
(uint32) (startpoint >>32), (uint32) (startpoint),
719+
xlogreader->errormsg_buf);
720+
else
721+
elog(WARNING,"Could not read WAL record at %X/%X",
722+
(uint32) (startpoint >>32), (uint32) (startpoint));
723+
PrintXLogCorruptionMsg(&reader_data,ERROR);
724+
}
725+
startpoint=found;
726+
727+
while (attempts <=timeout)
728+
{
729+
XLogRecord*record;
730+
char*errormsg;
731+
732+
if (interrupted)
733+
elog(ERROR,"Interrupted during WAL reading");
734+
735+
record=XLogReadRecord(xlogreader,startpoint,&errormsg);
736+
737+
if (record==NULL)
738+
{
739+
XLogRecPtrerrptr;
740+
741+
errptr=XLogRecPtrIsInvalid(startpoint) ?xlogreader->EndRecPtr :
742+
startpoint;
743+
744+
if (errormsg)
745+
elog(WARNING,"Could not read WAL record at %X/%X: %s",
746+
(uint32) (errptr >>32), (uint32) (errptr),
747+
errormsg);
748+
else
749+
elog(WARNING,"Could not read WAL record at %X/%X",
750+
(uint32) (errptr >>32), (uint32) (errptr));
751+
PrintXLogCorruptionMsg(&reader_data,ERROR);
752+
}
753+
754+
if (xlogreader->ReadRecPtr >=target)
755+
{
756+
elog(LOG,"Record %X/%X is next after target LSN %X/%X",
757+
(uint32) (xlogreader->ReadRecPtr >>32), (uint32) (xlogreader->ReadRecPtr),
758+
(uint32) (target >>32), (uint32) (target));
759+
res=xlogreader->ReadRecPtr;
760+
break;
761+
}
762+
else
763+
startpoint=InvalidXLogRecPtr;
764+
}
765+
766+
/* cleanup */
767+
CleanupXLogPageRead(xlogreader);
768+
XLogReaderFree(xlogreader);
769+
770+
returnres;
771+
}
772+
773+
683774
/*
684775
* Get LSN of a record prior to target_lsn.
685776
* If 'start_lsn' is in the segment with number 'segno' then start from 'start_lsn',

‎src/pg_probackup.h‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -960,6 +960,8 @@ extern XLogRecPtr get_prior_record_lsn(const char *archivedir, XLogRecPtr start_
960960

961961
externXLogRecPtrget_first_record_lsn(constchar*archivedir,XLogRecPtrstart_lsn,
962962
TimeLineIDtli,uint32wal_seg_size,inttimeout);
963+
externXLogRecPtrget_next_record_lsn(constchar*archivedir,XLogSegNosegno,TimeLineIDtli,
964+
uint32wal_seg_size,inttimeout,XLogRecPtrtarget);
963965

964966
/* in util.c */
965967
externTimeLineIDget_current_timeline(PGconn*conn);

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp