Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit1ce14b6

Browse files
committed
Fix possible recovery trouble if TRUNCATE overlaps a checkpoint.
If TRUNCATE causes some buffers to be invalidated and thus the checkpoint does not flush them, TRUNCATE must also ensure that the corresponding files are truncated on disk. Otherwise, a replay from the checkpoint might find that the buffers exist but have the wrong contents, which may cause replay to fail. Report by Teja Mupparti. Patch by Kyotaro Horiguchi, per a design suggestion from Heikki Linnakangas, with some changes to the comments by me. Review of this and a prior patch that approached the issue differently by Heikki Linnakangas, Andres Freund, Álvaro Herrera, Masahiko Sawada, and Tom Lane. Discussion: http://postgr.es/m/BYAPR06MB6373BF50B469CA393C614257ABF00@BYAPR06MB6373.namprd06.prod.outlook.com
1 parent c0f99bb commit 1ce14b6

File tree

11 files changed

+120
-28
lines changed

11 files changed

+120
-28
lines changed

‎src/backend/access/transam/multixact.c‎

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3071,8 +3071,8 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
30713071
* crash/basebackup, even though the state of the data directory would
30723072
* require it.
30733073
*/
3074-
Assert(!MyProc->delayChkpt);
3075-
MyProc->delayChkpt= true;
3074+
Assert((MyProc->delayChkpt&DELAY_CHKPT_START)==0);
3075+
MyProc->delayChkpt|=DELAY_CHKPT_START;
30763076

30773077
/* WAL log truncation */
30783078
WriteMTruncateXlogRec(newOldestMultiDB,
@@ -3098,7 +3098,7 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
30983098
/* Then offsets */
30993099
PerformOffsetsTruncation(oldestMulti,newOldestMulti);
31003100

3101-
MyProc->delayChkpt= false;
3101+
MyProc->delayChkpt&= ~DELAY_CHKPT_START;
31023102

31033103
END_CRIT_SECTION();
31043104
LWLockRelease(MultiXactTruncationLock);

‎src/backend/access/transam/twophase.c‎

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -476,7 +476,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid,
476476
}
477477
pgxact->xid=xid;
478478
pgxact->xmin=InvalidTransactionId;
479-
proc->delayChkpt=false;
479+
proc->delayChkpt=0;
480480
pgxact->vacuumFlags=0;
481481
proc->pid=0;
482482
proc->databaseId=databaseid;
@@ -1170,7 +1170,8 @@ EndPrepare(GlobalTransaction gxact)
11701170

11711171
START_CRIT_SECTION();
11721172

1173-
MyProc->delayChkpt= true;
1173+
Assert((MyProc->delayChkpt&DELAY_CHKPT_START)==0);
1174+
MyProc->delayChkpt |=DELAY_CHKPT_START;
11741175

11751176
XLogBeginInsert();
11761177
for (record=records.head;record!=NULL;record=record->next)
@@ -1213,7 +1214,7 @@ EndPrepare(GlobalTransaction gxact)
12131214
* checkpoint starting after this will certainly see the gxact as a
12141215
* candidate for fsyncing.
12151216
*/
1216-
MyProc->delayChkpt= false;
1217+
MyProc->delayChkpt&= ~DELAY_CHKPT_START;
12171218

12181219
/*
12191220
* Remember that we have this GlobalTransaction entry locked for us. If
@@ -2286,7 +2287,8 @@ RecordTransactionCommitPrepared(TransactionId xid,
22862287
START_CRIT_SECTION();
22872288

22882289
/* See notes in RecordTransactionCommit */
2289-
MyProc->delayChkpt= true;
2290+
Assert((MyProc->delayChkpt&DELAY_CHKPT_START)==0);
2291+
MyProc->delayChkpt |=DELAY_CHKPT_START;
22902292

22912293
/*
22922294
* Emit the XLOG commit record. Note that we mark 2PC commits as
@@ -2334,7 +2336,7 @@ RecordTransactionCommitPrepared(TransactionId xid,
23342336
TransactionIdCommitTree(xid,nchildren,children);
23352337

23362338
/* Checkpoint can proceed now */
2337-
MyProc->delayChkpt= false;
2339+
MyProc->delayChkpt&= ~DELAY_CHKPT_START;
23382340

23392341
END_CRIT_SECTION();
23402342

‎src/backend/access/transam/xact.c‎

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1308,8 +1308,9 @@ RecordTransactionCommit(void)
13081308
* This makes checkpoint's determination of which xacts are delayChkpt
13091309
* a bit fuzzy, but it doesn't matter.
13101310
*/
1311+
Assert((MyProc->delayChkpt&DELAY_CHKPT_START)==0);
13111312
START_CRIT_SECTION();
1312-
MyProc->delayChkpt= true;
1313+
MyProc->delayChkpt|=DELAY_CHKPT_START;
13131314

13141315
SetCurrentTransactionStopTimestamp();
13151316

@@ -1410,7 +1411,7 @@ RecordTransactionCommit(void)
14101411
*/
14111412
if (markXidCommitted)
14121413
{
1413-
MyProc->delayChkpt= false;
1414+
MyProc->delayChkpt&= ~DELAY_CHKPT_START;
14141415
END_CRIT_SECTION();
14151416
}
14161417

‎src/backend/access/transam/xlog.c‎

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9022,18 +9022,30 @@ CreateCheckPoint(int flags)
90229022
* and we will correctly flush the update below. So we cannot miss any
90239023
* xacts we need to wait for.
90249024
*/
9025-
vxids=GetVirtualXIDsDelayingChkpt(&nvxids);
9025+
vxids=GetVirtualXIDsDelayingChkpt(&nvxids,DELAY_CHKPT_START);
90269026
if (nvxids>0)
90279027
{
90289028
do
90299029
{
90309030
pg_usleep(10000L);/* wait for 10 msec */
9031-
}while (HaveVirtualXIDsDelayingChkpt(vxids,nvxids));
9031+
}while (HaveVirtualXIDsDelayingChkpt(vxids,nvxids,
9032+
DELAY_CHKPT_START));
90329033
}
90339034
pfree(vxids);
90349035

90359036
CheckPointGuts(checkPoint.redo,flags);
90369037

9038+
vxids=GetVirtualXIDsDelayingChkpt(&nvxids,DELAY_CHKPT_COMPLETE);
9039+
if (nvxids>0)
9040+
{
9041+
do
9042+
{
9043+
pg_usleep(10000L);/* wait for 10 msec */
9044+
}while (HaveVirtualXIDsDelayingChkpt(vxids,nvxids,
9045+
DELAY_CHKPT_COMPLETE));
9046+
}
9047+
pfree(vxids);
9048+
90379049
/*
90389050
* Take a snapshot of running transactions and write this to WAL. This
90399051
* allows us to reconstruct the state of running transactions during

‎src/backend/access/transam/xloginsert.c‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -904,7 +904,7 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
904904
/*
905905
* Ensure no checkpoint can change our view of RedoRecPtr.
906906
*/
907-
Assert(MyProc->delayChkpt);
907+
Assert((MyProc->delayChkpt&DELAY_CHKPT_START)!=0);
908908

909909
/*
910910
* Update RedoRecPtr so that we can make the right decision

‎src/backend/catalog/storage.c‎

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,22 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
325325

326326
RelationPreTruncate(rel);
327327

328+
/*
329+
* Make sure that a concurrent checkpoint can't complete while truncation
330+
* is in progress.
331+
*
332+
* The truncation operation might drop buffers that the checkpoint
333+
* otherwise would have flushed. If it does, then it's essential that
334+
* the files actually get truncated on disk before the checkpoint record
335+
* is written. Otherwise, if replay begins from that checkpoint, the
336+
* to-be-truncated blocks might still exist on disk but have older
337+
* contents than expected, which can cause replay to fail. It's OK for
338+
* the blocks to not exist on disk at all, but not for them to have the
339+
* wrong contents.
340+
*/
341+
Assert((MyProc->delayChkpt&DELAY_CHKPT_COMPLETE)==0);
342+
MyProc->delayChkpt |=DELAY_CHKPT_COMPLETE;
343+
328344
/*
329345
* We WAL-log the truncation before actually truncating, which means
330346
* trouble if the truncation fails. If we then crash, the WAL replay
@@ -363,13 +379,24 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
363379
XLogFlush(lsn);
364380
}
365381

366-
/* Do the real work to truncate relation forks */
382+
/*
383+
* This will first remove any buffers from the buffer pool that should no
384+
* longer exist after truncation is complete, and then truncate the
385+
* corresponding files on disk.
386+
*/
367387
smgrtruncate(rel->rd_smgr,forks,nforks,blocks);
368388

389+
/* We've done all the critical work, so checkpoints are OK now. */
390+
MyProc->delayChkpt &= ~DELAY_CHKPT_COMPLETE;
391+
369392
/*
370393
* Update upper-level FSM pages to account for the truncation. This is
371394
* important because the just-truncated pages were likely marked as
372395
* all-free, and would be preferentially selected.
396+
*
397+
* NB: There's no point in delaying checkpoints until this is done.
398+
* Because the FSM is not WAL-logged, we have to be prepared for the
399+
* possibility of corruption after a crash anyway.
373400
*/
374401
if (need_fsm_vacuum)
375402
FreeSpaceMapVacuumRange(rel,nblocks,InvalidBlockNumber);

‎src/backend/storage/buffer/bufmgr.c‎

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3647,7 +3647,9 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
36473647
* essential that CreateCheckpoint waits for virtual transactions
36483648
* rather than full transactionids.
36493649
*/
3650-
MyProc->delayChkpt=delayChkpt= true;
3650+
Assert((MyProc->delayChkpt&DELAY_CHKPT_START)==0);
3651+
MyProc->delayChkpt |=DELAY_CHKPT_START;
3652+
delayChkpt= true;
36513653
lsn=XLogSaveBufferForHint(buffer,buffer_std);
36523654
}
36533655

@@ -3680,7 +3682,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
36803682
UnlockBufHdr(bufHdr,buf_state);
36813683

36823684
if (delayChkpt)
3683-
MyProc->delayChkpt= false;
3685+
MyProc->delayChkpt&= ~DELAY_CHKPT_START;
36843686

36853687
if (dirtied)
36863688
{

‎src/backend/storage/ipc/procarray.c‎

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -434,7 +434,10 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
434434
pgxact->xmin=InvalidTransactionId;
435435
/* must be cleared with xid/xmin: */
436436
pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
437-
proc->delayChkpt= false;/* be sure this is cleared in abort */
437+
438+
/* be sure this is cleared in abort */
439+
proc->delayChkpt=0;
440+
438441
proc->recoveryConflictPending= false;
439442

440443
Assert(pgxact->nxids==0);
@@ -456,7 +459,10 @@ ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact,
456459
pgxact->xmin=InvalidTransactionId;
457460
/* must be cleared with xid/xmin: */
458461
pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
459-
proc->delayChkpt= false;/* be sure this is cleared in abort */
462+
463+
/* be sure this is cleared in abort */
464+
proc->delayChkpt=0;
465+
460466
proc->recoveryConflictPending= false;
461467

462468
/* Clear the subtransaction-XID cache too while holding the lock */
@@ -2272,7 +2278,8 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly)
22722278
* delaying checkpoint because they have critical actions in progress.
22732279
*
22742280
* Constructs an array of VXIDs of transactions that are currently in commit
2275-
* critical sections, as shown by having delayChkpt set in their PGPROC.
2281+
* critical sections, as shown by having specified delayChkpt bits set in their
2282+
* PGPROC.
22762283
*
22772284
* Returns a palloc'd array that should be freed by the caller.
22782285
* *nvxids is the number of valid entries.
@@ -2286,13 +2293,15 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly)
22862293
* for clearing of delayChkpt to propagate is unimportant for correctness.
22872294
*/
22882295
VirtualTransactionId*
2289-
GetVirtualXIDsDelayingChkpt(int*nvxids)
2296+
GetVirtualXIDsDelayingChkpt(int*nvxids,inttype)
22902297
{
22912298
VirtualTransactionId*vxids;
22922299
ProcArrayStruct*arrayP=procArray;
22932300
intcount=0;
22942301
intindex;
22952302

2303+
Assert(type!=0);
2304+
22962305
/* allocate what's certainly enough result space */
22972306
vxids= (VirtualTransactionId*)
22982307
palloc(sizeof(VirtualTransactionId)*arrayP->maxProcs);
@@ -2304,7 +2313,7 @@ GetVirtualXIDsDelayingChkpt(int *nvxids)
23042313
intpgprocno=arrayP->pgprocnos[index];
23052314
PGPROC*proc=&allProcs[pgprocno];
23062315

2307-
if (proc->delayChkpt)
2316+
if ((proc->delayChkpt&type)!=0)
23082317
{
23092318
VirtualTransactionIdvxid;
23102319

@@ -2330,12 +2339,14 @@ GetVirtualXIDsDelayingChkpt(int *nvxids)
23302339
* those numbers should be small enough for it not to be a problem.
23312340
*/
23322341
bool
2333-
HaveVirtualXIDsDelayingChkpt(VirtualTransactionId*vxids,intnvxids)
2342+
HaveVirtualXIDsDelayingChkpt(VirtualTransactionId*vxids,intnvxids,inttype)
23342343
{
23352344
boolresult= false;
23362345
ProcArrayStruct*arrayP=procArray;
23372346
intindex;
23382347

2348+
Assert(type!=0);
2349+
23392350
LWLockAcquire(ProcArrayLock,LW_SHARED);
23402351

23412352
for (index=0;index<arrayP->numProcs;index++)
@@ -2346,7 +2357,8 @@ HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids)
23462357

23472358
GET_VXID_FROM_PGPROC(vxid,*proc);
23482359

2349-
if (proc->delayChkpt&&VirtualTransactionIdIsValid(vxid))
2360+
if ((proc->delayChkpt&type)!=0&&
2361+
VirtualTransactionIdIsValid(vxid))
23502362
{
23512363
inti;
23522364

‎src/backend/storage/lmgr/proc.c‎

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -396,7 +396,7 @@ InitProcess(void)
396396
MyProc->roleId=InvalidOid;
397397
MyProc->tempNamespaceId=InvalidOid;
398398
MyProc->isBackgroundWorker=IsBackgroundWorker;
399-
MyProc->delayChkpt=false;
399+
MyProc->delayChkpt=0;
400400
MyPgXact->vacuumFlags=0;
401401
/* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */
402402
if (IsAutoVacuumWorkerProcess())
@@ -578,7 +578,7 @@ InitAuxiliaryProcess(void)
578578
MyProc->roleId=InvalidOid;
579579
MyProc->tempNamespaceId=InvalidOid;
580580
MyProc->isBackgroundWorker=IsBackgroundWorker;
581-
MyProc->delayChkpt=false;
581+
MyProc->delayChkpt=0;
582582
MyPgXact->vacuumFlags=0;
583583
MyProc->lwWaiting= false;
584584
MyProc->lwWaitMode=0;

‎src/include/storage/proc.h‎

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,41 @@ struct XidCache
8383
*/
8484
#defineINVALID_PGPROCNOPG_INT32_MAX
8585

86+
/*
87+
* Flags for PGPROC.delayChkpt
88+
*
89+
* These flags can be used to delay the start or completion of a checkpoint
90+
* for short periods. A flag is in effect if the corresponding bit is set in
91+
* the PGPROC of any backend.
92+
*
93+
* For our purposes here, a checkpoint has three phases: (1) determine the
94+
* location to which the redo pointer will be moved, (2) write all the
95+
* data durably to disk, and (3) WAL-log the checkpoint.
96+
*
97+
* Setting DELAY_CHKPT_START prevents the system from moving from phase 1
98+
* to phase 2. This is useful when we are performing a WAL-logged modification
99+
* of data that will be flushed to disk in phase 2. By setting this flag
100+
* before writing WAL and clearing it after we've both written WAL and
101+
* performed the corresponding modification, we ensure that if the WAL record
102+
* is inserted prior to the new redo point, the corresponding data changes will
103+
* also be flushed to disk before the checkpoint can complete. (In the
104+
* extremely common case where the data being modified is in shared buffers
105+
* and we acquire an exclusive content lock on the relevant buffers before
106+
* writing WAL, this mechanism is not needed, because phase 2 will block
107+
* until we release the content lock and then flush the modified data to
108+
* disk.)
109+
*
110+
* Setting DELAY_CHKPT_COMPLETE prevents the system from moving from phase 2
111+
* to phase 3. This is useful if we are performing a WAL-logged operation that
112+
* might invalidate buffers, such as relation truncation. In this case, we need
113+
* to ensure that any buffers which were invalidated and thus not flushed by
114+
* the checkpoint are actually destroyed on disk. Replay can cope with a file
115+
* or block that doesn't exist, but not with a block that has the wrong
116+
* contents.
117+
*/
118+
#defineDELAY_CHKPT_START(1<<0)
119+
#defineDELAY_CHKPT_COMPLETE(1<<1)
120+
86121
/*
87122
* Each backend has a PGPROC struct in shared memory. There is also a list of
88123
* currently-unused PGPROC structs that will be reallocated to new backends.
@@ -149,7 +184,7 @@ struct PGPROC
149184
LOCKMASKheldLocks;/* bitmask for lock types already held on this
150185
* lock object by this backend */
151186

152-
booldelayChkpt;/*true if this proc delays checkpoint start */
187+
intdelayChkpt;/*for DELAY_CHKPT_* flags */
153188

154189
/*
155190
* Info to allow us to wait for synchronous replication, if needed.

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp