Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitbf78f50

Browse files
Don't leave behind junk nbtree pages during split.
Commit8fa30f9 reduced the elevel of a number of "can't happen"_bt_split() errors from PANIC to ERROR. At the same time, the new rightpage buffer for the split could continue to be acquired well before thecritical section. This was possible because it was relativelystraightforward to make sure that _bt_split() could not throw an error,with a few specific exceptions. The exceptional cases were safe becausethey involved specific, well understood errors, making it possible toconsistently zero the right page before actually raising an error usingelog(). There was no danger of leaving around a junk page, provided_bt_split() stuck to this coding rule.Commit8224de4, which introduced INCLUDE indexes, added code to make_bt_split() truncate away non-key attributes. This happened at a pointthat broke the rule around zeroing the right page in _bt_split(). Iftruncation failed (perhaps due to palloc() failure), that would resultin an errant right page buffer with junk contents. This could confuseVACUUM when it attempted to delete the page, and should be avoided ongeneral principle.To fix, reorganize _bt_split() so that truncation occurs before the newright page buffer is even acquired. A junk page/buffer will not be leftbehind if _bt_nonkey_truncate()/_bt_truncate() raise an error.Discussion:https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.comBackpatch: 11-, where INCLUDE indexes were introduced.
1 parent6b0e941 commitbf78f50

File tree

1 file changed

+123
-84
lines changed

1 file changed

+123
-84
lines changed

‎src/backend/access/nbtree/nbtinsert.c

Lines changed: 123 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,7 @@ static void _bt_insertonpg(Relation rel, Buffer buf, Buffer cbuf,
7171
OffsetNumbernewitemoff,
7272
boolsplit_only_page);
7373
staticBuffer_bt_split(Relationrel,Bufferbuf,Buffercbuf,
74-
OffsetNumberfirstright,OffsetNumbernewitemoff,Sizenewitemsz,
75-
IndexTuplenewitem,boolnewitemonleft);
74+
OffsetNumbernewitemoff,Sizenewitemsz,IndexTuplenewitem);
7675
staticvoid_bt_insert_parent(Relationrel,Bufferbuf,Bufferrbuf,
7776
BTStackstack,boolis_root,boolis_only);
7877
staticOffsetNumber_bt_findsplitloc(Relationrel,Pagepage,
@@ -842,7 +841,6 @@ _bt_insertonpg(Relation rel,
842841
{
843842
Pagepage;
844843
BTPageOpaquelpageop;
845-
OffsetNumberfirstright=InvalidOffsetNumber;
846844
Sizeitemsz;
847845

848846
page=BufferGetPage(buf);
@@ -878,7 +876,6 @@ _bt_insertonpg(Relation rel,
878876
{
879877
boolis_root=P_ISROOT(lpageop);
880878
boolis_only=P_LEFTMOST(lpageop)&&P_RIGHTMOST(lpageop);
881-
boolnewitemonleft;
882879
Bufferrbuf;
883880

884881
/*
@@ -899,14 +896,8 @@ _bt_insertonpg(Relation rel,
899896
Assert(!(P_ISLEAF(lpageop)&&
900897
BlockNumberIsValid(RelationGetTargetBlock(rel))));
901898

902-
/* Choose the split point */
903-
firstright=_bt_findsplitloc(rel,page,
904-
newitemoff,itemsz,
905-
&newitemonleft);
906-
907899
/* split the buffer into left and right halves */
908-
rbuf=_bt_split(rel,buf,cbuf,firstright,
909-
newitemoff,itemsz,itup,newitemonleft);
900+
rbuf=_bt_split(rel,buf,cbuf,newitemoff,itemsz,itup);
910901
PredicateLockPageSplit(rel,
911902
BufferGetBlockNumber(buf),
912903
BufferGetBlockNumber(rbuf));
@@ -1106,9 +1097,8 @@ _bt_insertonpg(Relation rel,
11061097
*_bt_split() -- split a page in the btree.
11071098
*
11081099
*On entry, buf is the page to split, and is pinned and write-locked.
1109-
*firstright is the item index of the first item to be moved to the
1110-
*new right page. newitemoff etc. tell us about the new item that
1111-
*must be inserted along with the data from the old page.
1100+
*newitemoff etc. tell us about the new item that must be inserted
1101+
*along with the data from the original page.
11121102
*
11131103
*When splitting a non-leaf page, 'cbuf' is the left-sibling of the
11141104
*page we're inserting the downlink for. This function will clear the
@@ -1118,9 +1108,8 @@ _bt_insertonpg(Relation rel,
11181108
*The pin and lock on buf are maintained.
11191109
*/
11201110
staticBuffer
1121-
_bt_split(Relationrel,Bufferbuf,Buffercbuf,OffsetNumberfirstright,
1122-
OffsetNumbernewitemoff,Sizenewitemsz,IndexTuplenewitem,
1123-
boolnewitemonleft)
1111+
_bt_split(Relationrel,Bufferbuf,Buffercbuf,OffsetNumbernewitemoff,
1112+
Sizenewitemsz,IndexTuplenewitem)
11241113
{
11251114
Bufferrbuf;
11261115
Pageorigpage;
@@ -1139,99 +1128,81 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
11391128
IndexTupleitem;
11401129
OffsetNumberleftoff,
11411130
rightoff;
1131+
OffsetNumberfirstright;
11421132
OffsetNumbermaxoff;
11431133
OffsetNumberi;
1144-
boolisleaf;
1134+
boolnewitemonleft,
1135+
isleaf;
11451136
IndexTuplelefthikey;
11461137
intindnatts=IndexRelationGetNumberOfAttributes(rel);
11471138
intindnkeyatts=IndexRelationGetNumberOfKeyAttributes(rel);
11481139

1149-
/* Acquire a new page to split into */
1150-
rbuf=_bt_getbuf(rel,P_NEW,BT_WRITE);
1151-
11521140
/*
11531141
* origpage is the original page to be split. leftpage is a temporary
11541142
* buffer that receives the left-sibling data, which will be copied back
1155-
* into origpage on success. rightpage is the new page that receives the
1156-
* right-sibling data. If we fail before reaching the critical section,
1157-
* origpage hasn't been modified and leftpage is only workspace. In
1158-
* principle we shouldn't need to worry about rightpage either, because it
1159-
* hasn't been linked into the btree page structure; but to avoid leaving
1160-
* possibly-confusing junk behind, we are careful to rewrite rightpage as
1161-
* zeroes before throwing any error.
1143+
* into origpage on success. rightpage is the new page that will receive
1144+
* the right-sibling data.
1145+
*
1146+
* leftpage is allocated after choosing a split point. rightpage's new
1147+
* buffer isn't acquired until after leftpage is initialized and has new
1148+
* high key, the last point where splitting the page may fail (barring
1149+
* corruption). Failing before acquiring new buffer won't have lasting
1150+
* consequences, since origpage won't have been modified and leftpage is
1151+
* only workspace.
11621152
*/
11631153
origpage=BufferGetPage(buf);
1164-
leftpage=PageGetTempPage(origpage);
1165-
rightpage=BufferGetPage(rbuf);
1166-
1154+
oopaque= (BTPageOpaque)PageGetSpecialPointer(origpage);
11671155
origpagenumber=BufferGetBlockNumber(buf);
1168-
rightpagenumber=BufferGetBlockNumber(rbuf);
1169-
1170-
_bt_pageinit(leftpage,BufferGetPageSize(buf));
1171-
/* rightpage was already initialized by _bt_getbuf */
11721156

11731157
/*
1174-
* Copy the original page's LSN into leftpage, which will become the
1175-
* updated version of the page. We need this because XLogInsert will
1176-
* examine the LSN and possibly dump it in a page image.
1158+
* Choose a point to split origpage at.
1159+
*
1160+
* A split point can be thought of as a point _between_ two existing
1161+
* tuples on origpage (lastleft and firstright tuples), provided you
1162+
* pretend that the new item that didn't fit is already on origpage.
1163+
*
1164+
* Since origpage does not actually contain newitem, the representation of
1165+
* split points needs to work with two boundary cases: splits where
1166+
* newitem is lastleft, and splits where newitem is firstright.
1167+
* newitemonleft resolves the ambiguity that would otherwise exist when
1168+
* newitemoff == firstright. In all other cases it's clear which side of
1169+
* the split every tuple goes on from context. newitemonleft is usually
1170+
* (but not always) redundant information.
11771171
*/
1178-
PageSetLSN(leftpage,PageGetLSN(origpage));
1172+
firstright=_bt_findsplitloc(rel,origpage,newitemoff,newitemsz,
1173+
&newitemonleft);
11791174

1180-
/* init btree private data */
1181-
oopaque= (BTPageOpaque)PageGetSpecialPointer(origpage);
1175+
/* Allocate temp buffer for leftpage */
1176+
leftpage=PageGetTempPage(origpage);
1177+
_bt_pageinit(leftpage,BufferGetPageSize(buf));
11821178
lopaque= (BTPageOpaque)PageGetSpecialPointer(leftpage);
1183-
ropaque= (BTPageOpaque)PageGetSpecialPointer(rightpage);
1184-
1185-
isleaf=P_ISLEAF(oopaque);
11861179

1187-
/* if we're splitting this page, it won't be the root when we're done */
1188-
/* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */
1180+
/*
1181+
* leftpage won't be the root when we're done. Also, clear the SPLIT_END
1182+
* and HAS_GARBAGE flags.
1183+
*/
11891184
lopaque->btpo_flags=oopaque->btpo_flags;
11901185
lopaque->btpo_flags &= ~(BTP_ROOT |BTP_SPLIT_END |BTP_HAS_GARBAGE);
1191-
ropaque->btpo_flags=lopaque->btpo_flags;
1192-
/* set flag in left page indicating that the right page has no downlink */
1186+
/* set flag in leftpage indicating that rightpage has no downlink yet */
11931187
lopaque->btpo_flags |=BTP_INCOMPLETE_SPLIT;
11941188
lopaque->btpo_prev=oopaque->btpo_prev;
1195-
lopaque->btpo_next=rightpagenumber;
1196-
ropaque->btpo_prev=origpagenumber;
1197-
ropaque->btpo_next=oopaque->btpo_next;
1198-
lopaque->btpo.level=ropaque->btpo.level=oopaque->btpo.level;
1199-
/* Since we already have write-lock on both pages, ok to read cycleid */
1200-
lopaque->btpo_cycleid=_bt_vacuum_cycleid(rel);
1201-
ropaque->btpo_cycleid=lopaque->btpo_cycleid;
1189+
/* handle btpo_next after rightpage buffer acquired */
1190+
lopaque->btpo.level=oopaque->btpo.level;
1191+
/* handle btpo_cycleid after rightpage buffer acquired */
12021192

12031193
/*
1204-
* If the page we're splitting is not the rightmost page at its level in
1205-
* the tree, then the first entry on the page is the high key for the
1206-
* page. We need to copy that to the right half. Otherwise (meaning the
1207-
* rightmost page case), all the items on the right half will be user
1208-
* data.
1194+
* Copy the original page's LSN into leftpage, which will become the
1195+
* updated version of the page. We need this because XLogInsert will
1196+
* examine the LSN and possibly dump it in a page image.
12091197
*/
1210-
rightoff=P_HIKEY;
1211-
1212-
if (!P_RIGHTMOST(oopaque))
1213-
{
1214-
itemid=PageGetItemId(origpage,P_HIKEY);
1215-
itemsz=ItemIdGetLength(itemid);
1216-
item= (IndexTuple)PageGetItem(origpage,itemid);
1217-
Assert(BTreeTupleGetNAtts(item,rel)==indnkeyatts);
1218-
if (PageAddItem(rightpage, (Item)item,itemsz,rightoff,
1219-
false, false)==InvalidOffsetNumber)
1220-
{
1221-
memset(rightpage,0,BufferGetPageSize(rbuf));
1222-
elog(ERROR,"failed to add hikey to the right sibling"
1223-
" while splitting block %u of index \"%s\"",
1224-
origpagenumber,RelationGetRelationName(rel));
1225-
}
1226-
rightoff=OffsetNumberNext(rightoff);
1227-
}
1198+
PageSetLSN(leftpage,PageGetLSN(origpage));
1199+
isleaf=P_ISLEAF(oopaque);
12281200

12291201
/*
12301202
* The "high key" for the new left page will be the first key that's going
12311203
* to go into the new right page. This might be either the existing data
12321204
* item at position firstright, or the incoming tuple.
12331205
*/
1234-
leftoff=P_HIKEY;
12351206
if (!newitemonleft&&newitemoff==firstright)
12361207
{
12371208
/* incoming tuple will become first on right page */
@@ -1265,22 +1236,89 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
12651236
else
12661237
lefthikey=item;
12671238

1239+
/*
1240+
* Add new high key to leftpage
1241+
*/
1242+
leftoff=P_HIKEY;
1243+
12681244
Assert(BTreeTupleGetNAtts(lefthikey,rel)==indnkeyatts);
12691245
if (PageAddItem(leftpage, (Item)lefthikey,itemsz,leftoff,
12701246
false, false)==InvalidOffsetNumber)
1271-
{
1272-
memset(rightpage,0,BufferGetPageSize(rbuf));
12731247
elog(ERROR,"failed to add hikey to the left sibling"
12741248
" while splitting block %u of index \"%s\"",
12751249
origpagenumber,RelationGetRelationName(rel));
1276-
}
12771250
leftoff=OffsetNumberNext(leftoff);
12781251
/* be tidy */
12791252
if (lefthikey!=item)
12801253
pfree(lefthikey);
12811254

12821255
/*
1283-
* Now transfer all the data items to the appropriate page.
1256+
* Acquire a new right page to split into, now that left page has a new
1257+
* high key. From here on, it's not okay to throw an error without
1258+
* zeroing rightpage first. This coding rule ensures that we won't
1259+
* confuse future VACUUM operations, which might otherwise try to re-find
1260+
* a downlink to a leftover junk page as the page undergoes deletion.
1261+
*
1262+
* It would be reasonable to start the critical section just after the new
1263+
* rightpage buffer is acquired instead; that would allow us to avoid
1264+
* leftover junk pages without bothering to zero rightpage. We do it this
1265+
* way because it avoids an unnecessary PANIC when either origpage or its
1266+
* existing sibling page are corrupt.
1267+
*/
1268+
rbuf=_bt_getbuf(rel,P_NEW,BT_WRITE);
1269+
rightpage=BufferGetPage(rbuf);
1270+
rightpagenumber=BufferGetBlockNumber(rbuf);
1271+
/* rightpage was initialized by _bt_getbuf */
1272+
ropaque= (BTPageOpaque)PageGetSpecialPointer(rightpage);
1273+
1274+
/*
1275+
* Finish off remaining leftpage special area fields. They cannot be set
1276+
* before both origpage (leftpage) and rightpage buffers are acquired and
1277+
* locked.
1278+
*/
1279+
lopaque->btpo_next=rightpagenumber;
1280+
lopaque->btpo_cycleid=_bt_vacuum_cycleid(rel);
1281+
1282+
/*
1283+
* rightpage won't be the root when we're done. Also, clear the SPLIT_END
1284+
* and HAS_GARBAGE flags.
1285+
*/
1286+
ropaque->btpo_flags=oopaque->btpo_flags;
1287+
ropaque->btpo_flags &= ~(BTP_ROOT |BTP_SPLIT_END |BTP_HAS_GARBAGE);
1288+
ropaque->btpo_prev=origpagenumber;
1289+
ropaque->btpo_next=oopaque->btpo_next;
1290+
ropaque->btpo.level=oopaque->btpo.level;
1291+
ropaque->btpo_cycleid=lopaque->btpo_cycleid;
1292+
1293+
/*
1294+
* Add new high key to rightpage where necessary.
1295+
*
1296+
* If the page we're splitting is not the rightmost page at its level in
1297+
* the tree, then the first entry on the page is the high key from
1298+
* origpage.
1299+
*/
1300+
rightoff=P_HIKEY;
1301+
1302+
if (!P_RIGHTMOST(oopaque))
1303+
{
1304+
itemid=PageGetItemId(origpage,P_HIKEY);
1305+
itemsz=ItemIdGetLength(itemid);
1306+
item= (IndexTuple)PageGetItem(origpage,itemid);
1307+
Assert(BTreeTupleGetNAtts(item,rel)==indnkeyatts);
1308+
if (PageAddItem(rightpage, (Item)item,itemsz,rightoff,
1309+
false, false)==InvalidOffsetNumber)
1310+
{
1311+
memset(rightpage,0,BufferGetPageSize(rbuf));
1312+
elog(ERROR,"failed to add hikey to the right sibling"
1313+
" while splitting block %u of index \"%s\"",
1314+
origpagenumber,RelationGetRelationName(rel));
1315+
}
1316+
rightoff=OffsetNumberNext(rightoff);
1317+
}
1318+
1319+
/*
1320+
* Now transfer all the data items (non-pivot tuples in isleaf case, or
1321+
* additional pivot tuples in !isleaf case) to the appropriate page.
12841322
*
12851323
* Note: we *must* insert at least the right page's items in item-number
12861324
* order, for the benefit of _bt_restore_page().
@@ -1298,6 +1336,7 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
12981336
{
12991337
if (newitemonleft)
13001338
{
1339+
Assert(newitemoff <=firstright);
13011340
if (!_bt_pgaddtup(leftpage,newitemsz,newitem,leftoff))
13021341
{
13031342
memset(rightpage,0,BufferGetPageSize(rbuf));
@@ -1309,6 +1348,7 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
13091348
}
13101349
else
13111350
{
1351+
Assert(newitemoff >=firstright);
13121352
if (!_bt_pgaddtup(rightpage,newitemsz,newitem,rightoff))
13131353
{
13141354
memset(rightpage,0,BufferGetPageSize(rbuf));
@@ -1371,7 +1411,6 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
13711411
* all readers release locks on a page before trying to fetch its
13721412
* neighbors.
13731413
*/
1374-
13751414
if (!P_RIGHTMOST(oopaque))
13761415
{
13771416
sbuf=_bt_getbuf(rel,oopaque->btpo_next,BT_WRITE);

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp