Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 49a7610

Browse files
committed
Fix an ancient oversight in btree xlog replay. When trying to determine if an
upper-level insertion completes a previously-seen split, we cannot simply grab the downlink block number out of the buffer, because the buffer could contain a later state of the page --- or perhaps the page doesn't even exist at all any more, due to relation truncation. These possibilities have been masked up to now because the use of full_page_writes effectively ensured that no xlog replay routine ever actually saw a page state newer than its own change. Since we're deprecating full_page_writes in 8.1.*, there's no need to fix this in existing release branches, but we need a fix in HEAD if we want to have any hope of re-allowing full_page_writes. Accordingly, adjust the contents of btree WAL records so that we can always get the downlink block number from the WAL record rather than having to depend on buffer contents. Per report from Kevin Grittner and Peter Brant. Improve a few comments in related code while at it.
1 parent 3ef151e, commit 49a7610

File tree

3 files changed

+96
-55
lines changed

3 files changed

+96
-55
lines changed

‎src/backend/access/nbtree/nbtinsert.c

Lines changed: 31 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.134 2006/03/31 23:32:05 tgl Exp $
11+
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.135 2006/04/13 03:53:05 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -323,9 +323,9 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
323323
* child page on the parent.
324324
*+ updates the metapage if a true root or fast root is split.
325325
*
326-
*On entry, we must have the right buffer on which to do the
327-
*insertion, and the buffer must be pinned and locked. On return,
328-
*we will have dropped both the pin and the write lock on the buffer.
326+
*On entry, we must have the right buffer in which to do the
327+
*insertion, and the buffer must be pinned and write-locked. On return,
328+
*we will have dropped both the pin and the lock on the buffer.
329329
*
330330
*If 'afteritem' is >0 then the new tuple must be inserted after the
331331
*existing item of that number, noplace else. If 'afteritem' is 0
@@ -527,6 +527,8 @@ _bt_insertonpg(Relation rel,
527527
*/
528528
if (split_only_page)
529529
{
530+
Assert(!P_ISLEAF(lpageop));
531+
530532
metabuf=_bt_getbuf(rel,BTREE_METAPAGE,BT_WRITE);
531533
metapg=BufferGetPage(metabuf);
532534
metad=BTPageGetMeta(metapg);
@@ -557,10 +559,11 @@ _bt_insertonpg(Relation rel,
557559
if (!rel->rd_istemp)
558560
{
559561
xl_btree_insertxlrec;
562+
BlockNumberxldownlink;
560563
xl_btree_metadataxlmeta;
561564
uint8xlinfo;
562565
XLogRecPtrrecptr;
563-
XLogRecDatardata[3];
566+
XLogRecDatardata[4];
564567
XLogRecData*nextrdata;
565568
IndexTupleDatatrunctuple;
566569

@@ -572,6 +575,22 @@ _bt_insertonpg(Relation rel,
572575
rdata[0].buffer=InvalidBuffer;
573576
rdata[0].next=nextrdata=&(rdata[1]);
574577

578+
if (P_ISLEAF(lpageop))
579+
xlinfo=XLOG_BTREE_INSERT_LEAF;
580+
else
581+
{
582+
xldownlink=ItemPointerGetBlockNumber(&(itup->t_tid));
583+
Assert(ItemPointerGetOffsetNumber(&(itup->t_tid))==P_HIKEY);
584+
585+
nextrdata->data= (char*)&xldownlink;
586+
nextrdata->len=sizeof(BlockNumber);
587+
nextrdata->buffer=InvalidBuffer;
588+
nextrdata->next=nextrdata+1;
589+
nextrdata++;
590+
591+
xlinfo=XLOG_BTREE_INSERT_UPPER;
592+
}
593+
575594
if (BufferIsValid(metabuf))
576595
{
577596
xlmeta.root=metad->btm_root;
@@ -584,12 +603,9 @@ _bt_insertonpg(Relation rel,
584603
nextrdata->buffer=InvalidBuffer;
585604
nextrdata->next=nextrdata+1;
586605
nextrdata++;
606+
587607
xlinfo=XLOG_BTREE_INSERT_META;
588608
}
589-
elseif (P_ISLEAF(lpageop))
590-
xlinfo=XLOG_BTREE_INSERT_LEAF;
591-
else
592-
xlinfo=XLOG_BTREE_INSERT_UPPER;
593609

594610
/* Read comments in _bt_pgaddtup */
595611
if (!P_ISLEAF(lpageop)&&newitemoff==P_FIRSTDATAKEY(lpageop))
@@ -633,7 +649,7 @@ _bt_insertonpg(Relation rel,
633649
/*
634650
*_bt_split() -- split a page in the btree.
635651
*
636-
*On entry, buf is the page to split, and is write-locked and pinned.
652+
*On entry, buf is the page to split, and is pinned and write-locked.
637653
*firstright is the item index of the first item to be moved to the
638654
*new right page. newitemoff etc. tell us about the new item that
639655
*must be inserted along with the data from the old page.
@@ -860,7 +876,8 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
860876
* Direct access to page is not good but faster - we should implement
861877
* some new func in page API. Note we only store the tuples
862878
* themselves, knowing that the item pointers are in the same order
863-
* and can be reconstructed by scanning the tuples.
879+
* and can be reconstructed by scanning the tuples. See comments
880+
* for _bt_restore_page().
864881
*/
865882
xlrec.leftlen= ((PageHeader)leftpage)->pd_special-
866883
((PageHeader)leftpage)->pd_upper;
@@ -1445,6 +1462,9 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
14451462
* Insert the left page pointer into the new root page. The root page is
14461463
* the rightmost page on its level so there is no "high key" in it; the
14471464
* two items will go into positions P_HIKEY and P_FIRSTKEY.
1465+
*
1466+
* Note: we *must* insert the two items in item-number order, for the
1467+
* benefit of _bt_restore_page().
14481468
*/
14491469
if (PageAddItem(rootpage, (Item)new_item,itemsz,P_HIKEY,LP_USED)==InvalidOffsetNumber)
14501470
elog(PANIC,"failed to add leftkey to new root page");

‎src/backend/access/nbtree/nbtxlog.c

Lines changed: 63 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.31 2006/04/01 03:03:37 tgl Exp $
11+
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.32 2006/04/13 03:53:05 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -51,32 +51,16 @@ log_incomplete_split(RelFileNode node, BlockNumber leftblk,
5151
}
5252

5353
staticvoid
54-
forget_matching_split(Relationreln,RelFileNodenode,
55-
BlockNumberinsertblk,OffsetNumberoffnum,
56-
boolis_root)
54+
forget_matching_split(RelFileNodenode,BlockNumberdownlink,boolis_root)
5755
{
58-
Bufferbuffer;
59-
Pagepage;
60-
IndexTupleitup;
61-
BlockNumberrightblk;
6256
ListCell*l;
6357

64-
/* Get downlink TID from page */
65-
buffer=XLogReadBuffer(reln,insertblk, false);
66-
if (!BufferIsValid(buffer))
67-
return;
68-
page= (Page)BufferGetPage(buffer);
69-
itup= (IndexTuple)PageGetItem(page,PageGetItemId(page,offnum));
70-
rightblk=ItemPointerGetBlockNumber(&(itup->t_tid));
71-
Assert(ItemPointerGetOffsetNumber(&(itup->t_tid))==P_HIKEY);
72-
UnlockReleaseBuffer(buffer);
73-
7458
foreach(l,incomplete_splits)
7559
{
7660
bt_incomplete_split*split= (bt_incomplete_split*)lfirst(l);
7761

7862
if (RelFileNodeEquals(node,split->node)&&
79-
rightblk==split->rightblk)
63+
downlink==split->rightblk)
8064
{
8165
if (is_root!=split->is_root)
8266
elog(LOG,"forget_matching_split: fishy is_root data (expected %d, got %d)",
@@ -87,6 +71,20 @@ forget_matching_split(Relation reln, RelFileNode node,
8771
}
8872
}
8973

74+
/*
75+
* _bt_restore_page -- re-enter all the index tuples on a page
76+
*
77+
* The page is freshly init'd, and *from (length len) is a copy of what
78+
* had been its upper part (pd_upper to pd_special). We assume that the
79+
* tuples had been added to the page in item-number order, and therefore
80+
* the one with highest item number appears first (lowest on the page).
81+
*
82+
* NOTE: the way this routine is coded, the rebuilt page will have the items
83+
* in correct itemno sequence, but physically the opposite order from the
84+
* original, because we insert them in the opposite of itemno order. This
85+
* does not matter in any current btree code, but it's something to keep an
86+
* eye on. Is it worth changing just on general principles?
87+
*/
9088
staticvoid
9189
_bt_restore_page(Pagepage,char*from,intlen)
9290
{
@@ -158,18 +156,24 @@ btree_xlog_insert(bool isleaf, bool ismeta,
158156
char*datapos;
159157
intdatalen;
160158
xl_btree_metadatamd;
159+
BlockNumberdownlink=0;
161160

162161
datapos= (char*)xlrec+SizeOfBtreeInsert;
163162
datalen=record->xl_len-SizeOfBtreeInsert;
163+
if (!isleaf)
164+
{
165+
memcpy(&downlink,datapos,sizeof(BlockNumber));
166+
datapos+=sizeof(BlockNumber);
167+
datalen-=sizeof(BlockNumber);
168+
}
164169
if (ismeta)
165170
{
166171
memcpy(&md,datapos,sizeof(xl_btree_metadata));
167172
datapos+=sizeof(xl_btree_metadata);
168173
datalen-=sizeof(xl_btree_metadata);
169174
}
170175

171-
if ((record->xl_info&XLR_BKP_BLOCK_1)&& !ismeta&&
172-
incomplete_splits==NIL)
176+
if ((record->xl_info&XLR_BKP_BLOCK_1)&& !ismeta&&isleaf)
173177
return;/* nothing to do */
174178

175179
reln=XLogOpenRelation(xlrec->target.node);
@@ -208,13 +212,8 @@ btree_xlog_insert(bool isleaf, bool ismeta,
208212
md.fastroot,md.fastlevel);
209213

210214
/* Forget any split this insertion completes */
211-
if (!isleaf&&incomplete_splits!=NIL)
212-
{
213-
forget_matching_split(reln,xlrec->target.node,
214-
ItemPointerGetBlockNumber(&(xlrec->target.tid)),
215-
ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
216-
false);
217-
}
215+
if (!isleaf)
216+
forget_matching_split(xlrec->target.node,downlink, false);
218217
}
219218

220219
staticvoid
@@ -224,14 +223,17 @@ btree_xlog_split(bool onleft, bool isroot,
224223
xl_btree_split*xlrec= (xl_btree_split*)XLogRecGetData(record);
225224
Relationreln;
226225
BlockNumbertargetblk;
226+
OffsetNumbertargetoff;
227227
BlockNumberleftsib;
228228
BlockNumberrightsib;
229+
BlockNumberdownlink=0;
229230
Bufferbuffer;
230231
Pagepage;
231232
BTPageOpaquepageop;
232233

233234
reln=XLogOpenRelation(xlrec->target.node);
234235
targetblk=ItemPointerGetBlockNumber(&(xlrec->target.tid));
236+
targetoff=ItemPointerGetOffsetNumber(&(xlrec->target.tid));
235237
leftsib= (onleft) ?targetblk :xlrec->otherblk;
236238
rightsib= (onleft) ?xlrec->otherblk :targetblk;
237239

@@ -252,6 +254,16 @@ btree_xlog_split(bool onleft, bool isroot,
252254
(char*)xlrec+SizeOfBtreeSplit,
253255
xlrec->leftlen);
254256

257+
if (onleft&&xlrec->level>0)
258+
{
259+
IndexTupleitup;
260+
261+
/* extract downlink in the target tuple */
262+
itup= (IndexTuple)PageGetItem(page,PageGetItemId(page,targetoff));
263+
downlink=ItemPointerGetBlockNumber(&(itup->t_tid));
264+
Assert(ItemPointerGetOffsetNumber(&(itup->t_tid))==P_HIKEY);
265+
}
266+
255267
PageSetLSN(page,lsn);
256268
PageSetTLI(page,ThisTimeLineID);
257269
MarkBufferDirty(buffer);
@@ -274,6 +286,16 @@ btree_xlog_split(bool onleft, bool isroot,
274286
(char*)xlrec+SizeOfBtreeSplit+xlrec->leftlen,
275287
record->xl_len-SizeOfBtreeSplit-xlrec->leftlen);
276288

289+
if (!onleft&&xlrec->level>0)
290+
{
291+
IndexTupleitup;
292+
293+
/* extract downlink in the target tuple */
294+
itup= (IndexTuple)PageGetItem(page,PageGetItemId(page,targetoff));
295+
downlink=ItemPointerGetBlockNumber(&(itup->t_tid));
296+
Assert(ItemPointerGetOffsetNumber(&(itup->t_tid))==P_HIKEY);
297+
}
298+
277299
PageSetLSN(page,lsn);
278300
PageSetTLI(page,ThisTimeLineID);
279301
MarkBufferDirty(buffer);
@@ -308,13 +330,8 @@ btree_xlog_split(bool onleft, bool isroot,
308330
}
309331

310332
/* Forget any split this insertion completes */
311-
if (xlrec->level>0&&incomplete_splits!=NIL)
312-
{
313-
forget_matching_split(reln,xlrec->target.node,
314-
ItemPointerGetBlockNumber(&(xlrec->target.tid)),
315-
ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
316-
false);
317-
}
333+
if (xlrec->level>0)
334+
forget_matching_split(xlrec->target.node,downlink, false);
318335

319336
/* The job ain't done till the parent link is inserted... */
320337
log_incomplete_split(xlrec->target.node,
@@ -516,6 +533,7 @@ btree_xlog_newroot(XLogRecPtr lsn, XLogRecord *record)
516533
Bufferbuffer;
517534
Pagepage;
518535
BTPageOpaquepageop;
536+
BlockNumberdownlink=0;
519537

520538
reln=XLogOpenRelation(xlrec->node);
521539
buffer=XLogReadBuffer(reln,xlrec->rootblk, true);
@@ -532,9 +550,17 @@ btree_xlog_newroot(XLogRecPtr lsn, XLogRecord *record)
532550
pageop->btpo_flags |=BTP_LEAF;
533551

534552
if (record->xl_len>SizeOfBtreeNewroot)
553+
{
554+
IndexTupleitup;
555+
535556
_bt_restore_page(page,
536557
(char*)xlrec+SizeOfBtreeNewroot,
537558
record->xl_len-SizeOfBtreeNewroot);
559+
/* extract downlink to the right-hand split page */
560+
itup= (IndexTuple)PageGetItem(page,PageGetItemId(page,P_FIRSTKEY));
561+
downlink=ItemPointerGetBlockNumber(&(itup->t_tid));
562+
Assert(ItemPointerGetOffsetNumber(&(itup->t_tid))==P_HIKEY);
563+
}
538564

539565
PageSetLSN(page,lsn);
540566
PageSetTLI(page,ThisTimeLineID);
@@ -546,14 +572,8 @@ btree_xlog_newroot(XLogRecPtr lsn, XLogRecord *record)
546572
xlrec->rootblk,xlrec->level);
547573

548574
/* Check to see if this satisfies any incomplete insertions */
549-
if (record->xl_len>SizeOfBtreeNewroot&&
550-
incomplete_splits!=NIL)
551-
{
552-
forget_matching_split(reln,xlrec->node,
553-
xlrec->rootblk,
554-
P_FIRSTKEY,
555-
true);
556-
}
575+
if (record->xl_len>SizeOfBtreeNewroot)
576+
forget_matching_split(xlrec->node,downlink, true);
557577
}
558578

559579

‎src/include/access/nbtree.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.95 2006/04/01 03:03:37 tgl Exp $
10+
* $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.96 2006/04/13 03:53:05 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -206,6 +206,7 @@ typedef struct xl_btree_metadata
206206
typedefstructxl_btree_insert
207207
{
208208
xl_btreetidtarget;/* inserted tuple id */
209+
/* BlockNumber downlink field FOLLOWS IF NOT XLOG_BTREE_INSERT_LEAF */
209210
/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_INSERT_META */
210211
/* INDEX TUPLE FOLLOWS AT END OF STRUCT */
211212
}xl_btree_insert;

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp