Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit2280912

Browse files
committed
Fix corruption when relation truncation fails.
RelationTruncate() does three things, while holding anAccessExclusiveLock and preventing checkpoints:1. Logs the truncation.2. Drops buffers, even if they're dirty.3. Truncates some number of files.Step 2 could previously be canceled if it had to wait for I/O, and step3 could and still can fail in file APIs. All orderings of theseoperations have data corruption hazards if interrupted, so we can't giveup until the whole operation is done. When dirty pages were discardedbut the corresponding blocks were left on disk due to ERROR, old pageversions could come back from disk, reviving deleted data (seepgsql-bugs #18146 and several like it). When primary and standby wereallowed to disagree on relation size, standbys could panic (seepgsql-bugs #18426) or revive data unknown to visibility management onthe primary (theorized).Changes: * WAL is now unconditionally flushed first * smgrtruncate() is now called in a critical section, preventing interrupts and causing PANIC on file API failure * smgrtruncate() has a new parameter for existing fork sizes, because it can't call smgrnblocks() itself inside a critical sectionThe changes apply to RelationTruncate(), smgr_redo() andpg_truncate_visibility_map(). That last is also brought up to date withother evolutions of the truncation protocol.The VACUUM FileTruncate() failure mode had been discussed in olderreports than the ones referenced below, with independent analysis frommany people, but earlier theories on how to fix it were too complicatedto back-patch. The more recently invented cancellation bug wasdiagnosed by Alexander Lakhin. Other corruption scenarios were spottedby me while iterating on this patch and earlier commit75818b3.Back-patch to all supported releases.Reviewed-by: Michael Paquier <michael@paquier.xyz>Reviewed-by: Robert Haas <robertmhaas@gmail.com>Reported-by: rootcause000@gmail.comReported-by: Alexander Lakhin <exclusion@gmail.com>Discussion:https://postgr.es/m/18146-04e908c662113ad5%40postgresql.orgDiscussion:https://postgr.es/m/18426-2d18da6586f152d6%40postgresql.org
1 parent26a79cb commit2280912

File tree

6 files changed

+92
-33
lines changed

6 files changed

+92
-33
lines changed

‎contrib/pg_visibility/pg_visibility.c

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include"funcapi.h"
1919
#include"miscadmin.h"
2020
#include"storage/bufmgr.h"
21+
#include"storage/proc.h"
2122
#include"storage/procarray.h"
2223
#include"storage/smgr.h"
2324
#include"utils/rel.h"
@@ -385,6 +386,7 @@ pg_truncate_visibility_map(PG_FUNCTION_ARGS)
385386
Relationrel;
386387
ForkNumberfork;
387388
BlockNumberblock;
389+
BlockNumberold_block;
388390

389391
rel=relation_open(relid,AccessExclusiveLock);
390392

@@ -394,15 +396,24 @@ pg_truncate_visibility_map(PG_FUNCTION_ARGS)
394396
/* Forcibly reset cached file size */
395397
RelationGetSmgr(rel)->smgr_vm_nblocks=InvalidBlockNumber;
396398

399+
/* Compute new and old size before entering critical section. */
400+
fork=VISIBILITYMAP_FORKNUM;
397401
block=visibilitymap_prepare_truncate(rel,0);
398-
if (BlockNumberIsValid(block))
399-
{
400-
fork=VISIBILITYMAP_FORKNUM;
401-
smgrtruncate(RelationGetSmgr(rel),&fork,1,&block);
402-
}
402+
old_block=BlockNumberIsValid(block) ?smgrnblocks(RelationGetSmgr(rel),fork) :0;
403+
404+
/*
405+
* WAL-logging, buffer dropping, file truncation must be atomic and all on
406+
* one side of a checkpoint. See RelationTruncate() for discussion.
407+
*/
408+
Assert(!MyProc->delayChkpt);
409+
MyProc->delayChkpt= true;
410+
Assert(!MyProc->delayChkptEnd);
411+
MyProc->delayChkptEnd= true;
412+
START_CRIT_SECTION();
403413

404414
if (RelationNeedsWAL(rel))
405415
{
416+
XLogRecPtrlsn;
406417
xl_smgr_truncatexlrec;
407418

408419
xlrec.blkno=0;
@@ -412,9 +423,18 @@ pg_truncate_visibility_map(PG_FUNCTION_ARGS)
412423
XLogBeginInsert();
413424
XLogRegisterData((char*)&xlrec,sizeof(xlrec));
414425

415-
XLogInsert(RM_SMGR_ID,XLOG_SMGR_TRUNCATE |XLR_SPECIAL_REL_UPDATE);
426+
lsn=XLogInsert(RM_SMGR_ID,
427+
XLOG_SMGR_TRUNCATE |XLR_SPECIAL_REL_UPDATE);
428+
XLogFlush(lsn);
416429
}
417430

431+
if (BlockNumberIsValid(block))
432+
smgrtruncate(RelationGetSmgr(rel),&fork,1,&old_block,&block);
433+
434+
END_CRIT_SECTION();
435+
MyProc->delayChkpt= false;
436+
MyProc->delayChkptEnd= false;
437+
418438
/*
419439
* Release the lock right away, not at commit time.
420440
*

‎src/backend/catalog/storage.c

Lines changed: 32 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
280280
boolvm;
281281
boolneed_fsm_vacuum= false;
282282
ForkNumberforks[MAX_FORKNUM];
283+
BlockNumberold_blocks[MAX_FORKNUM];
283284
BlockNumberblocks[MAX_FORKNUM];
284285
intnforks=0;
285286
SMgrRelationreln;
@@ -295,6 +296,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
295296

296297
/* Prepare for truncation of MAIN fork of the relation */
297298
forks[nforks]=MAIN_FORKNUM;
299+
old_blocks[nforks]=smgrnblocks(reln,MAIN_FORKNUM);
298300
blocks[nforks]=nblocks;
299301
nforks++;
300302

@@ -306,6 +308,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
306308
if (BlockNumberIsValid(blocks[nforks]))
307309
{
308310
forks[nforks]=FSM_FORKNUM;
311+
old_blocks[nforks]=smgrnblocks(reln,FSM_FORKNUM);
309312
nforks++;
310313
need_fsm_vacuum= true;
311314
}
@@ -319,6 +322,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
319322
if (BlockNumberIsValid(blocks[nforks]))
320323
{
321324
forks[nforks]=VISIBILITYMAP_FORKNUM;
325+
old_blocks[nforks]=smgrnblocks(reln,VISIBILITYMAP_FORKNUM);
322326
nforks++;
323327
}
324328
}
@@ -357,14 +361,20 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
357361
MyProc->delayChkptEnd= true;/* DELAY_CHKPT_COMPLETE */
358362

359363
/*
360-
* We WAL-log the truncation before actually truncating, which means
361-
* trouble if the truncation fails. If we then crash, the WAL replay
362-
* likely isn't going to succeed in the truncation either, and cause a
363-
* PANIC. It's tempting to put a critical section here, but that cure
364-
* would be worse than the disease. It would turn a usually harmless
365-
* failure to truncate, that might spell trouble at WAL replay, into a
366-
* certain PANIC.
364+
* We WAL-log the truncation first and then truncate in a critical
365+
* section. Truncation drops buffers, even if dirty, and then truncates
366+
* disk files. All of that work needs to complete before the lock is
367+
* released, or else old versions of pages on disk that are missing recent
368+
* changes would become accessible again. We'll try the whole operation
369+
* again in crash recovery if we panic, but even then we can't give up
370+
* because we don't want standbys' relation sizes to diverge and break
371+
* replay or visibility invariants downstream. The critical section also
372+
* suppresses interrupts.
373+
*
374+
* (See also pg_visibilitymap.c if changing this code.)
367375
*/
376+
START_CRIT_SECTION();
377+
368378
if (RelationNeedsWAL(rel))
369379
{
370380
/*
@@ -388,18 +398,20 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
388398
* hit the disk before the WAL record, and the truncation of the FSM
389399
* or visibility map. If we crashed during that window, we'd be left
390400
* with a truncated heap, but the FSM or visibility map would still
391-
* contain entries for the non-existent heap pages.
401+
* contain entries for the non-existent heap pages, and standbys would
402+
* also never replay the truncation.
392403
*/
393-
if (fsm||vm)
394-
XLogFlush(lsn);
404+
XLogFlush(lsn);
395405
}
396406

397407
/*
398408
* This will first remove any buffers from the buffer pool that should no
399409
* longer exist after truncation is complete, and then truncate the
400410
* corresponding files on disk.
401411
*/
402-
smgrtruncate(RelationGetSmgr(rel),forks,nforks,blocks);
412+
smgrtruncate(RelationGetSmgr(rel),forks,nforks,old_blocks,blocks);
413+
414+
END_CRIT_SECTION();
403415

404416
/* We've done all the critical work, so checkpoints are OK now. */
405417
MyProc->delayChkpt= false;
@@ -984,6 +996,7 @@ smgr_redo(XLogReaderState *record)
984996
Relationrel;
985997
ForkNumberforks[MAX_FORKNUM];
986998
BlockNumberblocks[MAX_FORKNUM];
999+
BlockNumberold_blocks[MAX_FORKNUM];
9871000
intnforks=0;
9881001
boolneed_fsm_vacuum= false;
9891002

@@ -1018,6 +1031,7 @@ smgr_redo(XLogReaderState *record)
10181031
if ((xlrec->flags&SMGR_TRUNCATE_HEAP)!=0)
10191032
{
10201033
forks[nforks]=MAIN_FORKNUM;
1034+
old_blocks[nforks]=smgrnblocks(reln,MAIN_FORKNUM);
10211035
blocks[nforks]=xlrec->blkno;
10221036
nforks++;
10231037

@@ -1035,6 +1049,7 @@ smgr_redo(XLogReaderState *record)
10351049
if (BlockNumberIsValid(blocks[nforks]))
10361050
{
10371051
forks[nforks]=FSM_FORKNUM;
1052+
old_blocks[nforks]=smgrnblocks(reln,FSM_FORKNUM);
10381053
nforks++;
10391054
need_fsm_vacuum= true;
10401055
}
@@ -1046,13 +1061,18 @@ smgr_redo(XLogReaderState *record)
10461061
if (BlockNumberIsValid(blocks[nforks]))
10471062
{
10481063
forks[nforks]=VISIBILITYMAP_FORKNUM;
1064+
old_blocks[nforks]=smgrnblocks(reln,VISIBILITYMAP_FORKNUM);
10491065
nforks++;
10501066
}
10511067
}
10521068

10531069
/* Do the real work to truncate relation forks */
10541070
if (nforks>0)
1055-
smgrtruncate(reln,forks,nforks,blocks);
1071+
{
1072+
START_CRIT_SECTION();
1073+
smgrtruncate(reln,forks,nforks,old_blocks,blocks);
1074+
END_CRIT_SECTION();
1075+
}
10561076

10571077
/*
10581078
* Update upper-level FSM pages to account for the truncation. This is

‎src/backend/storage/smgr/md.c

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -827,19 +827,21 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
827827

828828
/*
829829
*mdtruncate() -- Truncate relation to specified number of blocks.
830+
*
831+
* Guaranteed not to allocate memory, so it can be used in a critical section.
832+
* Caller must have called smgrnblocks() to obtain curnblk while holding a
833+
* sufficient lock to prevent a change in relation size, and not used any smgr
834+
* functions for this relation or handled interrupts in between. This makes
835+
* sure we have opened all active segments, so that truncate loop will get
836+
* them all!
830837
*/
831838
void
832-
mdtruncate(SMgrRelationreln,ForkNumberforknum,BlockNumbernblocks)
839+
mdtruncate(SMgrRelationreln,ForkNumberforknum,
840+
BlockNumbercurnblk,BlockNumbernblocks)
833841
{
834-
BlockNumbercurnblk;
835842
BlockNumberpriorblocks;
836843
intcuropensegs;
837844

838-
/*
839-
* NOTE: mdnblocks makes sure we have opened all active segments, so that
840-
* truncation loop will get them all!
841-
*/
842-
curnblk=mdnblocks(reln,forknum);
843845
if (nblocks>curnblk)
844846
{
845847
/* Bogus request ... but no complaint if InRecovery */
@@ -1108,7 +1110,7 @@ _fdvec_resize(SMgrRelation reln,
11081110
reln->md_seg_fds[forknum]=
11091111
MemoryContextAlloc(MdCxt,sizeof(MdfdVec)*nseg);
11101112
}
1111-
else
1113+
elseif (nseg>reln->md_num_open_segs[forknum])
11121114
{
11131115
/*
11141116
* It doesn't seem worthwhile complicating the code to amortize
@@ -1120,6 +1122,16 @@ _fdvec_resize(SMgrRelation reln,
11201122
repalloc(reln->md_seg_fds[forknum],
11211123
sizeof(MdfdVec)*nseg);
11221124
}
1125+
else
1126+
{
1127+
/*
1128+
* We don't reallocate a smaller array, because we want mdtruncate()
1129+
* to be able to promise that it won't allocate memory, so that it is
1130+
* allowed in a critical section. This means that a bit of space in
1131+
* the array is now wasted, until the next time we add a segment and
1132+
* reallocate.
1133+
*/
1134+
}
11231135

11241136
reln->md_num_open_segs[forknum]=nseg;
11251137
}

‎src/backend/storage/smgr/smgr.c

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ typedef struct f_smgr
5959
BlockNumberblocknum,BlockNumbernblocks);
6060
BlockNumber (*smgr_nblocks) (SMgrRelationreln,ForkNumberforknum);
6161
void(*smgr_truncate) (SMgrRelationreln,ForkNumberforknum,
62-
BlockNumbernblocks);
62+
BlockNumberold_blocks,BlockNumbernblocks);
6363
void(*smgr_immedsync) (SMgrRelationreln,ForkNumberforknum);
6464
}f_smgr;
6565

@@ -548,10 +548,15 @@ smgrnblocks(SMgrRelation reln, ForkNumber forknum)
548548
*
549549
* The caller must hold AccessExclusiveLock on the relation, to ensure that
550550
* other backends receive the smgr invalidation event that this function sends
551-
* before they access any forks of the relation again.
551+
* before they access any forks of the relation again. The current size of
552+
* the forks should be provided in old_nblocks. This function should normally
553+
* be called in a critical section, but the current size must be checked
554+
* outside the critical section, and no interrupts or smgr functions relating
555+
* to this relation should be called in between.
552556
*/
553557
void
554-
smgrtruncate(SMgrRelationreln,ForkNumber*forknum,intnforks,BlockNumber*nblocks)
558+
smgrtruncate(SMgrRelationreln,ForkNumber*forknum,intnforks,
559+
BlockNumber*old_nblocks,BlockNumber*nblocks)
555560
{
556561
inti;
557562

@@ -576,7 +581,8 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb
576581
/* Do the truncation */
577582
for (i=0;i<nforks;i++)
578583
{
579-
smgrsw[reln->smgr_which].smgr_truncate(reln,forknum[i],nblocks[i]);
584+
smgrsw[reln->smgr_which].smgr_truncate(reln,forknum[i],
585+
old_nblocks[i],nblocks[i]);
580586

581587
/*
582588
* We might as well update the local smgr_fsm_nblocks and

‎src/include/storage/md.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ extern void mdwriteback(SMgrRelation reln, ForkNumber forknum,
3838
BlockNumberblocknum,BlockNumbernblocks);
3939
externBlockNumbermdnblocks(SMgrRelationreln,ForkNumberforknum);
4040
externvoidmdtruncate(SMgrRelationreln,ForkNumberforknum,
41-
BlockNumbernblocks);
41+
BlockNumberold_blocks,BlockNumbernblocks);
4242
externvoidmdimmedsync(SMgrRelationreln,ForkNumberforknum);
4343

4444
externvoidForgetDatabaseSyncRequests(Oiddbid);

‎src/include/storage/smgr.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,8 +101,9 @@ extern void smgrwrite(SMgrRelation reln, ForkNumber forknum,
101101
externvoidsmgrwriteback(SMgrRelationreln,ForkNumberforknum,
102102
BlockNumberblocknum,BlockNumbernblocks);
103103
externBlockNumbersmgrnblocks(SMgrRelationreln,ForkNumberforknum);
104-
externvoidsmgrtruncate(SMgrRelationreln,ForkNumber*forknum,
105-
intnforks,BlockNumber*nblocks);
104+
externvoidsmgrtruncate(SMgrRelationreln,ForkNumber*forknum,intnforks,
105+
BlockNumber*old_nblocks,
106+
BlockNumber*nblocks);
106107
externvoidsmgrimmedsync(SMgrRelationreln,ForkNumberforknum);
107108
externvoidAtEOXact_SMgr(void);
108109

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp