Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 9b42e71

Browse files
Don't leave behind junk nbtree pages during split.
Commit 8fa30f9 reduced the elevel of a number of "can't happen" _bt_split() errors from PANIC to ERROR. At the same time, the new rightpage buffer for the split could continue to be acquired well before the critical section. This was possible because it was relatively straightforward to make sure that _bt_split() could not throw an error, with a few specific exceptions. The exceptional cases were safe because they involved specific, well understood errors, making it possible to consistently zero the right page before actually raising an error using elog(). There was no danger of leaving around a junk page, provided _bt_split() stuck to this coding rule.

Commit 8224de4, which introduced INCLUDE indexes, added code to make _bt_split() truncate away non-key attributes. This happened at a point that broke the rule around zeroing the right page in _bt_split(). If truncation failed (perhaps due to palloc() failure), that would result in an errant right page buffer with junk contents. This could confuse VACUUM when it attempted to delete the page, and should be avoided on general principle.

To fix, reorganize _bt_split() so that truncation occurs before the new right page buffer is even acquired. A junk page/buffer will not be left behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.

Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com

Backpatch: 11-, where INCLUDE indexes were introduced.
1 parent 221b377, commit 9b42e71

File tree

1 file changed

+126
-87
lines changed

1 file changed

+126
-87
lines changed

‎src/backend/access/nbtree/nbtinsert.c

Lines changed: 126 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,8 @@ static void _bt_insertonpg(Relation rel, BTScanInsert itup_key,
4949
OffsetNumbernewitemoff,
5050
boolsplit_only_page);
5151
staticBuffer_bt_split(Relationrel,BTScanInsertitup_key,Bufferbuf,
52-
Buffercbuf,OffsetNumberfirstright,OffsetNumbernewitemoff,
53-
Sizenewitemsz,IndexTuplenewitem,boolnewitemonleft);
52+
Buffercbuf,OffsetNumbernewitemoff,Sizenewitemsz,
53+
IndexTuplenewitem);
5454
staticvoid_bt_insert_parent(Relationrel,Bufferbuf,Bufferrbuf,
5555
BTStackstack,boolis_root,boolis_only);
5656
staticbool_bt_pgaddtup(Pagepage,Sizeitemsize,IndexTupleitup,
@@ -943,7 +943,6 @@ _bt_insertonpg(Relation rel,
943943
{
944944
Pagepage;
945945
BTPageOpaquelpageop;
946-
OffsetNumberfirstright=InvalidOffsetNumber;
947946
Sizeitemsz;
948947

949948
page=BufferGetPage(buf);
@@ -979,7 +978,6 @@ _bt_insertonpg(Relation rel,
979978
{
980979
boolis_root=P_ISROOT(lpageop);
981980
boolis_only=P_LEFTMOST(lpageop)&&P_RIGHTMOST(lpageop);
982-
boolnewitemonleft;
983981
Bufferrbuf;
984982

985983
/*
@@ -1000,14 +998,8 @@ _bt_insertonpg(Relation rel,
1000998
Assert(!(P_ISLEAF(lpageop)&&
1001999
BlockNumberIsValid(RelationGetTargetBlock(rel))));
10021000

1003-
/* Choose the split point */
1004-
firstright=_bt_findsplitloc(rel,page,
1005-
newitemoff,itemsz,itup,
1006-
&newitemonleft);
1007-
10081001
/* split the buffer into left and right halves */
1009-
rbuf=_bt_split(rel,itup_key,buf,cbuf,firstright,newitemoff,
1010-
itemsz,itup,newitemonleft);
1002+
rbuf=_bt_split(rel,itup_key,buf,cbuf,newitemoff,itemsz,itup);
10111003
PredicateLockPageSplit(rel,
10121004
BufferGetBlockNumber(buf),
10131005
BufferGetBlockNumber(rbuf));
@@ -1211,9 +1203,8 @@ _bt_insertonpg(Relation rel,
12111203
*_bt_split() -- split a page in the btree.
12121204
*
12131205
*On entry, buf is the page to split, and is pinned and write-locked.
1214-
*firstright is the item index of the first item to be moved to the
1215-
*new right page. newitemoff etc. tell us about the new item that
1216-
*must be inserted along with the data from the old page.
1206+
*newitemoff etc. tell us about the new item that must be inserted
1207+
*along with the data from the original page.
12171208
*
12181209
*itup_key is used for suffix truncation on leaf pages (internal
12191210
*page callers pass NULL). When splitting a non-leaf page, 'cbuf'
@@ -1226,8 +1217,7 @@ _bt_insertonpg(Relation rel,
12261217
*/
12271218
staticBuffer
12281219
_bt_split(Relationrel,BTScanInsertitup_key,Bufferbuf,Buffercbuf,
1229-
OffsetNumberfirstright,OffsetNumbernewitemoff,Sizenewitemsz,
1230-
IndexTuplenewitem,boolnewitemonleft)
1220+
OffsetNumbernewitemoff,Sizenewitemsz,IndexTuplenewitem)
12311221
{
12321222
Bufferrbuf;
12331223
Pageorigpage;
@@ -1246,99 +1236,80 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
12461236
IndexTupleitem;
12471237
OffsetNumberleftoff,
12481238
rightoff;
1239+
OffsetNumberfirstright;
12491240
OffsetNumbermaxoff;
12501241
OffsetNumberi;
1251-
boolisleaf;
1242+
boolnewitemonleft,
1243+
isleaf;
12521244
IndexTuplelefthikey;
12531245
intindnatts=IndexRelationGetNumberOfAttributes(rel);
12541246
intindnkeyatts=IndexRelationGetNumberOfKeyAttributes(rel);
12551247

1256-
/* Acquire a new page to split into */
1257-
rbuf=_bt_getbuf(rel,P_NEW,BT_WRITE);
1258-
12591248
/*
12601249
* origpage is the original page to be split. leftpage is a temporary
12611250
* buffer that receives the left-sibling data, which will be copied back
1262-
* into origpage on success. rightpage is the new page that receives the
1263-
* right-sibling data. If we fail before reaching the critical section,
1264-
* origpage hasn't been modified and leftpage is only workspace. In
1265-
* principle we shouldn't need to worry about rightpage either, because it
1266-
* hasn't been linked into the btree page structure; but to avoid leaving
1267-
* possibly-confusing junk behind, we are careful to rewrite rightpage as
1268-
* zeroes before throwing any error.
1251+
* into origpage on success. rightpage is the new page that will receive
1252+
* the right-sibling data.
1253+
*
1254+
* leftpage is allocated after choosing a split point. rightpage's new
1255+
* buffer isn't acquired until after leftpage is initialized and has new
1256+
* high key, the last point where splitting the page may fail (barring
1257+
* corruption). Failing before acquiring new buffer won't have lasting
1258+
* consequences, since origpage won't have been modified and leftpage is
1259+
* only workspace.
12691260
*/
12701261
origpage=BufferGetPage(buf);
1271-
leftpage=PageGetTempPage(origpage);
1272-
rightpage=BufferGetPage(rbuf);
1273-
1262+
oopaque= (BTPageOpaque)PageGetSpecialPointer(origpage);
12741263
origpagenumber=BufferGetBlockNumber(buf);
1275-
rightpagenumber=BufferGetBlockNumber(rbuf);
1276-
1277-
_bt_pageinit(leftpage,BufferGetPageSize(buf));
1278-
/* rightpage was already initialized by _bt_getbuf */
12791264

12801265
/*
1281-
* Copy the original page's LSN into leftpage, which will become the
1282-
* updated version of the page. We need this because XLogInsert will
1283-
* examine the LSN and possibly dump it in a page image.
1266+
* Choose a point to split origpage at.
1267+
*
1268+
* A split point can be thought of as a point _between_ two existing
1269+
* tuples on origpage (lastleft and firstright tuples), provided you
1270+
* pretend that the new item that didn't fit is already on origpage.
1271+
*
1272+
* Since origpage does not actually contain newitem, the representation of
1273+
* split points needs to work with two boundary cases: splits where
1274+
* newitem is lastleft, and splits where newitem is firstright.
1275+
* newitemonleft resolves the ambiguity that would otherwise exist when
1276+
* newitemoff == firstright. In all other cases it's clear which side of
1277+
* the split every tuple goes on from context. newitemonleft is usually
1278+
* (but not always) redundant information.
12841279
*/
1285-
PageSetLSN(leftpage,PageGetLSN(origpage));
1280+
firstright=_bt_findsplitloc(rel,origpage,newitemoff,newitemsz,
1281+
newitem,&newitemonleft);
12861282

1287-
/* init btree private data */
1288-
oopaque= (BTPageOpaque)PageGetSpecialPointer(origpage);
1283+
/* Allocate temp buffer for leftpage */
1284+
leftpage=PageGetTempPage(origpage);
1285+
_bt_pageinit(leftpage,BufferGetPageSize(buf));
12891286
lopaque= (BTPageOpaque)PageGetSpecialPointer(leftpage);
1290-
ropaque= (BTPageOpaque)PageGetSpecialPointer(rightpage);
12911287

1292-
isleaf=P_ISLEAF(oopaque);
1293-
1294-
/* if we're splitting this page, it won't be the root when we're done */
1295-
/* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */
1288+
/*
1289+
* leftpage won't be the root when we're done. Also, clear the SPLIT_END
1290+
* and HAS_GARBAGE flags.
1291+
*/
12961292
lopaque->btpo_flags=oopaque->btpo_flags;
12971293
lopaque->btpo_flags &= ~(BTP_ROOT |BTP_SPLIT_END |BTP_HAS_GARBAGE);
1298-
ropaque->btpo_flags=lopaque->btpo_flags;
1299-
/* set flag in left page indicating that the right page has no downlink */
1294+
/* set flag in leftpage indicating that rightpage has no downlink yet */
13001295
lopaque->btpo_flags |=BTP_INCOMPLETE_SPLIT;
13011296
lopaque->btpo_prev=oopaque->btpo_prev;
1302-
lopaque->btpo_next=rightpagenumber;
1303-
ropaque->btpo_prev=origpagenumber;
1304-
ropaque->btpo_next=oopaque->btpo_next;
1305-
lopaque->btpo.level=ropaque->btpo.level=oopaque->btpo.level;
1306-
/* Since we already have write-lock on both pages, ok to read cycleid */
1307-
lopaque->btpo_cycleid=_bt_vacuum_cycleid(rel);
1308-
ropaque->btpo_cycleid=lopaque->btpo_cycleid;
1297+
/* handle btpo_next after rightpage buffer acquired */
1298+
lopaque->btpo.level=oopaque->btpo.level;
1299+
/* handle btpo_cycleid after rightpage buffer acquired */
13091300

13101301
/*
1311-
* If the page we're splitting is not the rightmost page at its level in
1312-
* the tree, then the first entry on the page is the high key for the
1313-
* page. We need to copy that to the right half. Otherwise (meaning the
1314-
* rightmost page case), all the items on the right half will be user
1315-
* data.
1302+
* Copy the original page's LSN into leftpage, which will become the
1303+
* updated version of the page. We need this because XLogInsert will
1304+
* examine the LSN and possibly dump it in a page image.
13161305
*/
1317-
rightoff=P_HIKEY;
1318-
1319-
if (!P_RIGHTMOST(oopaque))
1320-
{
1321-
itemid=PageGetItemId(origpage,P_HIKEY);
1322-
itemsz=ItemIdGetLength(itemid);
1323-
item= (IndexTuple)PageGetItem(origpage,itemid);
1324-
Assert(BTreeTupleGetNAtts(item,rel)>0);
1325-
Assert(BTreeTupleGetNAtts(item,rel) <=indnkeyatts);
1326-
if (PageAddItem(rightpage, (Item)item,itemsz,rightoff,
1327-
false, false)==InvalidOffsetNumber)
1328-
{
1329-
memset(rightpage,0,BufferGetPageSize(rbuf));
1330-
elog(ERROR,"failed to add hikey to the right sibling"
1331-
" while splitting block %u of index \"%s\"",
1332-
origpagenumber,RelationGetRelationName(rel));
1333-
}
1334-
rightoff=OffsetNumberNext(rightoff);
1335-
}
1306+
PageSetLSN(leftpage,PageGetLSN(origpage));
1307+
isleaf=P_ISLEAF(oopaque);
13361308

13371309
/*
13381310
* The "high key" for the new left page will be the first key that's going
1339-
* to go into the new right page, or possibly a truncated version if this
1340-
* is a leaf page split. This might be either the existing data item at
1341-
* position firstright, or the incoming tuple.
1311+
* to go into the new right page, or a truncated version if this is a leaf
1312+
* page split.
13421313
*
13431314
* The high key for the left page is formed using the first item on the
13441315
* right page, which may seem to be contrary to Lehman & Yao's approach of
@@ -1360,7 +1331,6 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
13601331
* tuple could be physically larger despite being opclass-equal in respect
13611332
* of all attributes prior to the heap TID attribute.)
13621333
*/
1363-
leftoff=P_HIKEY;
13641334
if (!newitemonleft&&newitemoff==firstright)
13651335
{
13661336
/* incoming tuple will become first on right page */
@@ -1416,23 +1386,91 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
14161386
else
14171387
lefthikey=item;
14181388

1389+
/*
1390+
* Add new high key to leftpage
1391+
*/
1392+
leftoff=P_HIKEY;
1393+
14191394
Assert(BTreeTupleGetNAtts(lefthikey,rel)>0);
14201395
Assert(BTreeTupleGetNAtts(lefthikey,rel) <=indnkeyatts);
14211396
if (PageAddItem(leftpage, (Item)lefthikey,itemsz,leftoff,
14221397
false, false)==InvalidOffsetNumber)
1423-
{
1424-
memset(rightpage,0,BufferGetPageSize(rbuf));
14251398
elog(ERROR,"failed to add hikey to the left sibling"
14261399
" while splitting block %u of index \"%s\"",
14271400
origpagenumber,RelationGetRelationName(rel));
1428-
}
14291401
leftoff=OffsetNumberNext(leftoff);
14301402
/* be tidy */
14311403
if (lefthikey!=item)
14321404
pfree(lefthikey);
14331405

14341406
/*
1435-
* Now transfer all the data items to the appropriate page.
1407+
* Acquire a new right page to split into, now that left page has a new
1408+
* high key. From here on, it's not okay to throw an error without
1409+
* zeroing rightpage first. This coding rule ensures that we won't
1410+
* confuse future VACUUM operations, which might otherwise try to re-find
1411+
* a downlink to a leftover junk page as the page undergoes deletion.
1412+
*
1413+
* It would be reasonable to start the critical section just after the new
1414+
* rightpage buffer is acquired instead; that would allow us to avoid
1415+
* leftover junk pages without bothering to zero rightpage. We do it this
1416+
* way because it avoids an unnecessary PANIC when either origpage or its
1417+
* existing sibling page are corrupt.
1418+
*/
1419+
rbuf=_bt_getbuf(rel,P_NEW,BT_WRITE);
1420+
rightpage=BufferGetPage(rbuf);
1421+
rightpagenumber=BufferGetBlockNumber(rbuf);
1422+
/* rightpage was initialized by _bt_getbuf */
1423+
ropaque= (BTPageOpaque)PageGetSpecialPointer(rightpage);
1424+
1425+
/*
1426+
* Finish off remaining leftpage special area fields. They cannot be set
1427+
* before both origpage (leftpage) and rightpage buffers are acquired and
1428+
* locked.
1429+
*/
1430+
lopaque->btpo_next=rightpagenumber;
1431+
lopaque->btpo_cycleid=_bt_vacuum_cycleid(rel);
1432+
1433+
/*
1434+
* rightpage won't be the root when we're done. Also, clear the SPLIT_END
1435+
* and HAS_GARBAGE flags.
1436+
*/
1437+
ropaque->btpo_flags=oopaque->btpo_flags;
1438+
ropaque->btpo_flags &= ~(BTP_ROOT |BTP_SPLIT_END |BTP_HAS_GARBAGE);
1439+
ropaque->btpo_prev=origpagenumber;
1440+
ropaque->btpo_next=oopaque->btpo_next;
1441+
ropaque->btpo.level=oopaque->btpo.level;
1442+
ropaque->btpo_cycleid=lopaque->btpo_cycleid;
1443+
1444+
/*
1445+
* Add new high key to rightpage where necessary.
1446+
*
1447+
* If the page we're splitting is not the rightmost page at its level in
1448+
* the tree, then the first entry on the page is the high key from
1449+
* origpage.
1450+
*/
1451+
rightoff=P_HIKEY;
1452+
1453+
if (!P_RIGHTMOST(oopaque))
1454+
{
1455+
itemid=PageGetItemId(origpage,P_HIKEY);
1456+
itemsz=ItemIdGetLength(itemid);
1457+
item= (IndexTuple)PageGetItem(origpage,itemid);
1458+
Assert(BTreeTupleGetNAtts(item,rel)>0);
1459+
Assert(BTreeTupleGetNAtts(item,rel) <=indnkeyatts);
1460+
if (PageAddItem(rightpage, (Item)item,itemsz,rightoff,
1461+
false, false)==InvalidOffsetNumber)
1462+
{
1463+
memset(rightpage,0,BufferGetPageSize(rbuf));
1464+
elog(ERROR,"failed to add hikey to the right sibling"
1465+
" while splitting block %u of index \"%s\"",
1466+
origpagenumber,RelationGetRelationName(rel));
1467+
}
1468+
rightoff=OffsetNumberNext(rightoff);
1469+
}
1470+
1471+
/*
1472+
* Now transfer all the data items (non-pivot tuples in isleaf case, or
1473+
* additional pivot tuples in !isleaf case) to the appropriate page.
14361474
*
14371475
* Note: we *must* insert at least the right page's items in item-number
14381476
* order, for the benefit of _bt_restore_page().
@@ -1450,6 +1488,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
14501488
{
14511489
if (newitemonleft)
14521490
{
1491+
Assert(newitemoff <=firstright);
14531492
if (!_bt_pgaddtup(leftpage,newitemsz,newitem,leftoff))
14541493
{
14551494
memset(rightpage,0,BufferGetPageSize(rbuf));
@@ -1461,6 +1500,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
14611500
}
14621501
else
14631502
{
1503+
Assert(newitemoff >=firstright);
14641504
if (!_bt_pgaddtup(rightpage,newitemsz,newitem,rightoff))
14651505
{
14661506
memset(rightpage,0,BufferGetPageSize(rbuf));
@@ -1523,7 +1563,6 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
15231563
* all readers release locks on a page before trying to fetch its
15241564
* neighbors.
15251565
*/
1526-
15271566
if (!P_RIGHTMOST(oopaque))
15281567
{
15291568
sbuf=_bt_getbuf(rel,oopaque->btpo_next,BT_WRITE);

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp