33
33
#include "storage/predicate.h"
34
34
#include "utils/snapmgr.h"
35
35
36
- static bool _bt_mark_page_halfdead (Relation rel ,Buffer buf ,BTStack stack );
36
+ static bool _bt_mark_page_halfdead (Relation rel ,Buffer leafbuf ,BTStack stack );
37
37
static bool _bt_unlink_halfdead_page (Relation rel ,Buffer leafbuf ,
38
- bool * rightsib_empty );
38
+ bool * rightsib_empty , TransactionId * oldestBtpoXact );
39
39
static bool _bt_lock_branch_parent (Relation rel ,BlockNumber child ,
40
40
BTStack stack ,Buffer * topparent ,OffsetNumber * topoff ,
41
41
BlockNumber * target ,BlockNumber * rightsib );
@@ -1219,27 +1219,35 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
1219
1219
}
1220
1220
1221
1221
/*
1222
- * _bt_pagedel() -- Delete a page from the b-tree, if legal to do so.
1222
+ * _bt_pagedel() -- Delete aleaf page from the b-tree, if legal to do so.
1223
1223
*
1224
- * This action unlinks the page from the b-tree structure, removing all
1224
+ * This action unlinks theleaf page from the b-tree structure, removing all
1225
1225
* pointers leading to it --- but not touching its own left and right links.
1226
1226
* The page cannot be physically reclaimed right away, since other processes
1227
1227
* may currently be trying to follow links leading to the page; they have to
1228
1228
* be allowed to use its right-link to recover. See nbtree/README.
1229
1229
*
1230
1230
* On entry, the target buffer must be pinned and locked (either read or write
1231
- * lock is OK). This lock and pin will be dropped before exiting.
1231
+ * lock is OK). The page must be an empty leaf page, which may be half-dead
1232
+ * already (a half-dead page should only be passed to us when an earlier
1233
+ * VACUUM operation was interrupted, though). Note in particular that caller
1234
+ * should never pass a buffer containing an existing deleted page here. The
1235
+ * lock and pin on caller's buffer will be dropped before we return.
1232
1236
*
1233
1237
* Returns the number of pages successfully deleted (zero if page cannot
1234
- * be deleted now; could be more than one if parent or sibling pages were
1235
- * deleted too).
1238
+ * be deleted now; could be more than one if parent or right sibling pages
1239
+ * were deleted too).
1240
+ *
1241
+ * Maintains *oldestBtpoXact for any pages that get deleted. Caller is
1242
+ * responsible for maintaining *oldestBtpoXact in the case of pages that were
1243
+ * deleted by a previous VACUUM.
1236
1244
*
1237
1245
* NOTE: this leaks memory. Rather than trying to clean up everything
1238
1246
* carefully, it's better to run it in a temp context that can be reset
1239
1247
* frequently.
1240
1248
*/
1241
1249
int
1242
- _bt_pagedel (Relation rel ,Buffer buf )
1250
+ _bt_pagedel (Relation rel ,Buffer leafbuf , TransactionId * oldestBtpoXact )
1243
1251
{
1244
1252
int ndeleted = 0 ;
1245
1253
BlockNumber rightsib ;
@@ -1260,14 +1268,21 @@ _bt_pagedel(Relation rel, Buffer buf)
1260
1268
1261
1269
for (;;)
1262
1270
{
1263
- page = BufferGetPage (buf );
1271
+ page = BufferGetPage (leafbuf );
1264
1272
opaque = (BTPageOpaque )PageGetSpecialPointer (page );
1265
1273
1266
1274
/*
1267
1275
* Internal pages are never deleted directly, only as part of deleting
1268
1276
* the whole branch all the way down to leaf level.
1277
+ *
1278
+ * Also check for deleted pages here. Caller never passes us a fully
1279
+ * deleted page. Only VACUUM can delete pages, so there can't have
1280
+ * been a concurrent deletion. Assume that we reached any deleted
1281
+ * page encountered here by following a sibling link, and that the
1282
+ * index is corrupt.
1269
1283
*/
1270
- if (!P_ISLEAF (opaque ))
1284
+ Assert (!P_ISDELETED (opaque ));
1285
+ if (!P_ISLEAF (opaque )|| P_ISDELETED (opaque ))
1271
1286
{
1272
1287
/*
1273
1288
* Pre-9.4 page deletion only marked internal pages as half-dead,
@@ -1286,13 +1301,22 @@ _bt_pagedel(Relation rel, Buffer buf)
1286
1301
errmsg ("index \"%s\" contains a half-dead internal page" ,
1287
1302
RelationGetRelationName (rel )),
1288
1303
errhint ("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it." )));
1289
- _bt_relbuf (rel ,buf );
1304
+
1305
+ if (P_ISDELETED (opaque ))
1306
+ ereport (LOG ,
1307
+ (errcode (ERRCODE_INDEX_CORRUPTED ),
1308
+ errmsg_internal ("found deleted block %u while following right link in index \"%s\"" ,
1309
+ BufferGetBlockNumber (leafbuf ),
1310
+ RelationGetRelationName (rel ))));
1311
+
1312
+ _bt_relbuf (rel ,leafbuf );
1290
1313
return ndeleted ;
1291
1314
}
1292
1315
1293
1316
/*
1294
1317
* We can never delete rightmost pages nor root pages. While at it,
1295
- * check that page is not already deleted and is empty.
1318
+ * check that page is empty, since it's possible that the leafbuf page
1319
+ * was empty a moment ago, but has since had some inserts.
1296
1320
*
1297
1321
* To keep the algorithm simple, we also never delete an incompletely
1298
1322
* split page (they should be rare enough that this doesn't make any
@@ -1307,14 +1331,14 @@ _bt_pagedel(Relation rel, Buffer buf)
1307
1331
* to. On subsequent iterations, we know we stepped right from a page
1308
1332
* that passed these tests, so it's OK.
1309
1333
*/
1310
- if (P_RIGHTMOST (opaque )|| P_ISROOT (opaque )|| P_ISDELETED ( opaque ) ||
1334
+ if (P_RIGHTMOST (opaque )|| P_ISROOT (opaque )||
1311
1335
P_FIRSTDATAKEY (opaque ) <=PageGetMaxOffsetNumber (page )||
1312
1336
P_INCOMPLETE_SPLIT (opaque ))
1313
1337
{
1314
1338
/* Should never fail to delete a half-dead page */
1315
1339
Assert (!P_ISHALFDEAD (opaque ));
1316
1340
1317
- _bt_relbuf (rel ,buf );
1341
+ _bt_relbuf (rel ,leafbuf );
1318
1342
return ndeleted ;
1319
1343
}
1320
1344
@@ -1351,7 +1375,7 @@ _bt_pagedel(Relation rel, Buffer buf)
1351
1375
* To avoid deadlocks, we'd better drop the leaf page lock
1352
1376
* before going further.
1353
1377
*/
1354
- LockBuffer (buf ,BUFFER_LOCK_UNLOCK );
1378
+ LockBuffer (leafbuf ,BUFFER_LOCK_UNLOCK );
1355
1379
1356
1380
/*
1357
1381
* Fetch the left sibling, to check that it's not marked with
@@ -1375,10 +1399,10 @@ _bt_pagedel(Relation rel, Buffer buf)
1375
1399
* incompletely-split page to be split again. So we don't
1376
1400
* need to walk right here.
1377
1401
*/
1378
- if (lopaque -> btpo_next == BufferGetBlockNumber (buf )&&
1402
+ if (lopaque -> btpo_next == BufferGetBlockNumber (leafbuf )&&
1379
1403
P_INCOMPLETE_SPLIT (lopaque ))
1380
1404
{
1381
- ReleaseBuffer (buf );
1405
+ ReleaseBuffer (leafbuf );
1382
1406
_bt_relbuf (rel ,lbuf );
1383
1407
return ndeleted ;
1384
1408
}
@@ -1395,40 +1419,59 @@ _bt_pagedel(Relation rel, Buffer buf)
1395
1419
_bt_relbuf (rel ,lbuf );
1396
1420
1397
1421
/*
1398
- * Re-lock the leaf page, and start over, to re-check that the
1399
- * page can still be deleted.
1422
+ * Re-lock the leaf page, and start over to use our stack
1423
+ * within _bt_mark_page_halfdead. We must do it that way
1424
+ * because it's possible that leafbuf can no longer be
1425
+ * deleted. We need to recheck.
1400
1426
*/
1401
- LockBuffer (buf ,BT_WRITE );
1427
+ LockBuffer (leafbuf ,BT_WRITE );
1402
1428
continue ;
1403
1429
}
1404
1430
1405
- if (!_bt_mark_page_halfdead (rel ,buf ,stack ))
1431
+ /*
1432
+ * See if it's safe to delete the leaf page, and determine how
1433
+ * many parent/internal pages above the leaf level will be
1434
+ * deleted. If it's safe then _bt_mark_page_halfdead will also
1435
+ * perform the first phase of deletion, which includes marking the
1436
+ * leafbuf page half-dead.
1437
+ */
1438
+ Assert (P_ISLEAF (opaque )&& !P_IGNORE (opaque ));
1439
+ if (!_bt_mark_page_halfdead (rel ,leafbuf ,stack ))
1406
1440
{
1407
- _bt_relbuf (rel ,buf );
1441
+ _bt_relbuf (rel ,leafbuf );
1408
1442
return ndeleted ;
1409
1443
}
1410
1444
}
1411
1445
1412
1446
/*
1413
1447
* Then unlink it from its siblings. Each call to
1414
1448
* _bt_unlink_halfdead_page unlinks the topmost page from the branch,
1415
- * making it shallower. Iterate until the leaf page is gone.
1449
+ * making it shallower. Iterate until the leafbuf page is deleted.
1450
+ *
1451
+ * _bt_unlink_halfdead_page should never fail, since we established
1452
+ * that deletion is generally safe in _bt_mark_page_halfdead.
1416
1453
*/
1417
1454
rightsib_empty = false;
1455
+ Assert (P_ISLEAF (opaque )&& P_ISHALFDEAD (opaque ));
1418
1456
while (P_ISHALFDEAD (opaque ))
1419
1457
{
1420
- /* will check for interrupts, once lock is released */
1421
- if (!_bt_unlink_halfdead_page (rel ,buf ,& rightsib_empty ))
1458
+ /* Check for interrupts in _bt_unlink_halfdead_page */
1459
+ if (!_bt_unlink_halfdead_page (rel ,leafbuf ,& rightsib_empty ,
1460
+ oldestBtpoXact ))
1422
1461
{
1423
- /* _bt_unlink_halfdead_pagealready released buffer */
1462
+ /* _bt_unlink_halfdead_pagefailed, released buffer */
1424
1463
return ndeleted ;
1425
1464
}
1426
1465
ndeleted ++ ;
1427
1466
}
1428
1467
1468
+ Assert (P_ISLEAF (opaque )&& P_ISDELETED (opaque ));
1469
+ Assert (TransactionIdFollowsOrEquals (opaque -> btpo .xact ,
1470
+ * oldestBtpoXact ));
1471
+
1429
1472
rightsib = opaque -> btpo_next ;
1430
1473
1431
- _bt_relbuf (rel ,buf );
1474
+ _bt_relbuf (rel ,leafbuf );
1432
1475
1433
1476
/*
1434
1477
* Check here, as calling loops will have locks held, preventing
@@ -1448,7 +1491,7 @@ _bt_pagedel(Relation rel, Buffer buf)
1448
1491
if (!rightsib_empty )
1449
1492
break ;
1450
1493
1451
- buf = _bt_getbuf (rel ,rightsib ,BT_WRITE );
1494
+ leafbuf = _bt_getbuf (rel ,rightsib ,BT_WRITE );
1452
1495
}
1453
1496
1454
1497
return ndeleted ;
@@ -1641,17 +1684,28 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
1641
1684
* of the whole branch, including the leaf page itself, iterate until the
1642
1685
* leaf page is deleted.
1643
1686
*
1644
- * Returns 'false' if the page could not be unlinked (shouldn't happen).
1645
- * If the (new) right sibling of the page is empty, *rightsib_empty is set
1646
- * to true.
1687
+ * Returns 'false' if the page could not be unlinked (shouldn't happen). If
1688
+ * the right sibling of the current target page is empty, *rightsib_empty is
1689
+ * set to true, allowing caller to delete the target's right sibling page in
1690
+ * passing. Note that *rightsib_empty is only actually used by caller when
1691
+ * target page is leafbuf, following last call here for leafbuf/the subtree
1692
+ * containing leafbuf. (We always set *rightsib_empty for caller, just to be
1693
+ * consistent.)
1694
+ *
1695
+ * We maintain *oldestBtpoXact for pages that are deleted by the current
1696
+ * VACUUM operation here. This must be handled here because we conservatively
1697
+ * assume that there needs to be a new call to ReadNewTransactionId() each
1698
+ * time a page gets deleted. See comments about the underlying assumption
1699
+ * below.
1647
1700
*
1648
1701
* Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
1649
1702
* On success exit, we'll be holding pin and write lock. On failure exit,
1650
1703
* we'll release both pin and lock before returning (we define it that way
1651
1704
* to avoid having to reacquire a lock we already released).
1652
1705
*/
1653
1706
static bool
1654
- _bt_unlink_halfdead_page (Relation rel ,Buffer leafbuf ,bool * rightsib_empty )
1707
+ _bt_unlink_halfdead_page (Relation rel ,Buffer leafbuf ,bool * rightsib_empty ,
1708
+ TransactionId * oldestBtpoXact )
1655
1709
{
1656
1710
BlockNumber leafblkno = BufferGetBlockNumber (leafbuf );
1657
1711
BlockNumber leafleftsib ;
@@ -1788,9 +1842,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
1788
1842
lbuf = InvalidBuffer ;
1789
1843
1790
1844
/*
1791
- * Next write-lock the target page itself. It should be okay to takejust
1792
- *a write lock not a superexclusive lock, since noscans would stop on an
1793
- *empty page.
1845
+ * Next write-lock the target page itself. It's okay to takea write lock
1846
+ *rather than a superexclusive lock, since noscan will stop on an empty
1847
+ * page.
1794
1848
*/
1795
1849
LockBuffer (buf ,BT_WRITE );
1796
1850
page = BufferGetPage (buf );
@@ -1928,6 +1982,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
1928
1982
*/
1929
1983
page = BufferGetPage (buf );
1930
1984
opaque = (BTPageOpaque )PageGetSpecialPointer (page );
1985
+ Assert (P_ISHALFDEAD (opaque )|| !P_ISLEAF (opaque ));
1931
1986
opaque -> btpo_flags &= ~BTP_HALF_DEAD ;
1932
1987
opaque -> btpo_flags |=BTP_DELETED ;
1933
1988
opaque -> btpo .xact = ReadNewTransactionId ();
@@ -2030,6 +2085,10 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
2030
2085
_bt_relbuf (rel ,lbuf );
2031
2086
_bt_relbuf (rel ,rbuf );
2032
2087
2088
+ if (!TransactionIdIsValid (* oldestBtpoXact )||
2089
+ TransactionIdPrecedes (opaque -> btpo .xact ,* oldestBtpoXact ))
2090
+ * oldestBtpoXact = opaque -> btpo .xact ;
2091
+
2033
2092
/*
2034
2093
* Release the target, if it was not the leaf block. The leaf is always
2035
2094
* kept locked.