@@ -93,8 +93,7 @@ typedef struct BTParallelScanDescData *BTParallelScanDesc;
9393static void btvacuumscan (IndexVacuumInfo * info ,IndexBulkDeleteResult * stats ,
9494IndexBulkDeleteCallback callback ,void * callback_state ,
9595BTCycleId cycleid );
96- static void btvacuumpage (BTVacState * vstate ,BlockNumber blkno ,
97- BlockNumber orig_blkno );
96+ static void btvacuumpage (BTVacState * vstate ,BlockNumber scanblkno );
9897static BTVacuumPosting btreevacuumposting (BTVacState * vstate ,
9998IndexTuple posting ,
10099OffsetNumber updatedoffset ,
@@ -959,7 +958,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
959958Relation rel = info -> index ;
960959BTVacState vstate ;
961960BlockNumber num_pages ;
962- BlockNumber blkno ;
961+ BlockNumber scanblkno ;
963962bool needLock ;
964963
965964/*
@@ -1009,7 +1008,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
10091008 */
10101009needLock = !RELATION_IS_LOCAL (rel );
10111010
1012- blkno = BTREE_METAPAGE + 1 ;
1011+ scanblkno = BTREE_METAPAGE + 1 ;
10131012for (;;)
10141013{
10151014/* Get the current relation length */
@@ -1024,15 +1023,15 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
10241023num_pages );
10251024
10261025/* Quit if we've scanned the whole relation */
1027- if (blkno >=num_pages )
1026+ if (scanblkno >=num_pages )
10281027break ;
10291028/* Iterate over pages, then loop back to recheck length */
1030- for (;blkno < num_pages ;blkno ++ )
1029+ for (;scanblkno < num_pages ;scanblkno ++ )
10311030{
1032- btvacuumpage (& vstate ,blkno , blkno );
1031+ btvacuumpage (& vstate ,scanblkno );
10331032if (info -> report_progress )
10341033pgstat_progress_update_param (PROGRESS_SCAN_BLOCKS_DONE ,
1035- blkno );
1034+ scanblkno );
10361035}
10371036}
10381037
@@ -1076,31 +1075,33 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
10761075/*
10771076 * btvacuumpage --- VACUUM one page
10781077 *
1079- * This processes a single page for btvacuumscan(). In some cases we
1080- * must go back and re-examine previously-scanned pages; this routine
1081- * recurses when necessary to handle that case.
1082- *
1083- * blkno is the page to process. orig_blkno is the highest block number
1084- * reached by the outer btvacuumscan loop (the same as blkno, unless we
1085- * are recursing to re-examine a previous page).
1078+ * This processes a single page for btvacuumscan(). In some cases we must
1079+ * backtrack to re-examine and VACUUM pages that were the scanblkno during
1080+ * a previous call here. This is how we handle page splits (that happened
1081+ * after our cycleid was acquired) whose right half page happened to reuse
1082+ * a block that we might have processed at some point before it was
1083+ * recycled (i.e. before the page split).
10861084 */
10871085static void
1088- btvacuumpage (BTVacState * vstate ,BlockNumber blkno , BlockNumber orig_blkno )
1086+ btvacuumpage (BTVacState * vstate ,BlockNumber scanblkno )
10891087{
10901088IndexVacuumInfo * info = vstate -> info ;
10911089IndexBulkDeleteResult * stats = vstate -> stats ;
10921090IndexBulkDeleteCallback callback = vstate -> callback ;
10931091void * callback_state = vstate -> callback_state ;
10941092Relation rel = info -> index ;
1095- bool delete_now ;
1096- BlockNumber recurse_to ;
1093+ bool attempt_pagedel ;
1094+ BlockNumber blkno , backtrack_to ;
10971095Buffer buf ;
10981096Page page ;
1099- BTPageOpaque opaque = NULL ;
1097+ BTPageOpaque opaque ;
1098+
1099+ blkno = scanblkno ;
1100+
1101+ backtrack :
11001102
1101- restart :
1102- delete_now = false;
1103- recurse_to = P_NONE ;
1103+ attempt_pagedel = false;
1104+ backtrack_to = P_NONE ;
11041105
11051106/* call vacuum_delay_point while not holding any buffer lock */
11061107vacuum_delay_point ();
@@ -1115,24 +1116,59 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
11151116info -> strategy );
11161117LockBuffer (buf ,BT_READ );
11171118page = BufferGetPage (buf );
1119+ opaque = NULL ;
11181120if (!PageIsNew (page ))
11191121{
11201122_bt_checkpage (rel ,buf );
11211123opaque = (BTPageOpaque )PageGetSpecialPointer (page );
11221124}
11231125
1124- /*
1125- * If we are recursing, the only case we want to do anything with is a
1126- * live leaf page having the current vacuum cycle ID. Any other state
1127- * implies we already saw the page (eg, deleted it as being empty).
1128- */
1129- if (blkno != orig_blkno )
1126+ Assert (blkno <=scanblkno );
1127+ if (blkno != scanblkno )
11301128{
1131- if (_bt_page_recyclable (page )||
1132- P_IGNORE (opaque )||
1133- !P_ISLEAF (opaque )||
1134- opaque -> btpo_cycleid != vstate -> cycleid )
1129+ /*
1130+ * We're backtracking.
1131+ *
1132+ * We followed a right link to a sibling leaf page (a page that
1133+ * happens to be from a block located before scanblkno). The only
1134+ * case we want to do anything with is a live leaf page having the
1135+ * current vacuum cycle ID.
1136+ *
1137+ * The page had better be in a state that's consistent with what we
1138+ * expect. Check for conditions that imply corruption in passing. It
1139+ * can't be half-dead because only an interrupted VACUUM process can
1140+ * leave pages in that state, so we'd definitely have dealt with it
1141+ * back when the page was the scanblkno page (half-dead pages are
1142+ * always marked fully deleted by _bt_pagedel()). This assumes that
1143+ * there can be only one vacuum process running at a time.
1144+ */
1145+ if (!opaque || !P_ISLEAF (opaque )|| P_ISHALFDEAD (opaque ))
11351146{
1147+ Assert (false);
1148+ ereport (LOG ,
1149+ (errcode (ERRCODE_INDEX_CORRUPTED ),
1150+ errmsg_internal ("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"" ,
1151+ blkno ,scanblkno ,RelationGetRelationName (rel ))));
1152+ _bt_relbuf (rel ,buf );
1153+ return ;
1154+ }
1155+
1156+ /*
1157+ * We may have already processed the page in an earlier call, when the
1158+ * page was scanblkno. This happens when the leaf page split occurred
1159+ * after the scan began, but before the right sibling page became the
1160+ * scanblkno.
1161+ *
1162+ * Page may also have been deleted by current btvacuumpage() call,
1163+ * since _bt_pagedel() sometimes deletes the right sibling page of
1164+ * scanblkno in passing (it does so after we decided where to
1165+ * backtrack to). We don't need to process this page as a deleted
1166+ * page a second time now (in fact, it would be wrong to count it as a
1167+ * deleted page in the bulk delete statistics a second time).
1168+ */
1169+ if (opaque -> btpo_cycleid != vstate -> cycleid || P_ISDELETED (opaque ))
1170+ {
1171+ /* Done with current scanblkno (and all lower split pages) */
11361172_bt_relbuf (rel ,buf );
11371173return ;
11381174}
@@ -1165,7 +1201,7 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
11651201 * Half-dead leaf page. Try to delete now. Might update
11661202 * oldestBtpoXact and pages_deleted below.
11671203 */
1168- delete_now = true;
1204+ attempt_pagedel = true;
11691205}
11701206else if (P_ISLEAF (opaque ))
11711207{
@@ -1189,18 +1225,20 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
11891225LockBufferForCleanup (buf );
11901226
11911227/*
1192- * Check whether we need to recurse back to earlier pages. What we
1193- * are concerned about is a page split that happened since we started
1194- * the vacuum scan. If the split moved some tuples to a lower page
1195- * then we might have missed 'em. If so, set up for tail recursion.
1196- * (Must do this before possibly clearing btpo_cycleid below!)
1228+ * Check whether we need to backtrack to earlier pages. What we are
1229+ * concerned about is a page split that happened since we started the
1230+ * vacuum scan. If the split moved tuples on the right half of the
1231+ * split (i.e. the tuples that sort high) to a block that we already
1232+ * passed over, then we might have missed the tuples. We need to
1233+ * backtrack now. (Must do this before possibly clearing btpo_cycleid
1234+ * or deleting scanblkno page below!)
11971235 */
11981236if (vstate -> cycleid != 0 &&
11991237opaque -> btpo_cycleid == vstate -> cycleid &&
12001238!(opaque -> btpo_flags & BTP_SPLIT_END )&&
12011239!P_RIGHTMOST (opaque )&&
1202- opaque -> btpo_next < orig_blkno )
1203- recurse_to = opaque -> btpo_next ;
1240+ opaque -> btpo_next < scanblkno )
1241+ backtrack_to = opaque -> btpo_next ;
12041242
12051243/*
12061244 * When each VACUUM begins, it determines an OldestXmin cutoff value.
@@ -1311,7 +1349,7 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
13111349 */
13121350if (ndeletable > 0 || nupdatable > 0 )
13131351{
1314- Assert (nhtidsdead >=Max ( ndeletable , 1 ) );
1352+ Assert (nhtidsdead >=ndeletable + nupdatable );
13151353_bt_delitems_vacuum (rel ,buf ,deletable ,ndeletable ,updatable ,
13161354nupdatable );
13171355
@@ -1347,19 +1385,19 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
13471385/*
13481386 * If the leaf page is now empty, try to delete it; else count the
13491387 * live tuples (live table TIDs in posting lists are counted as
1350- * separate live tuples). We don't delete whenrecursing , though, to
1351- *avoid putting entries into freePages out-of-order (doesn't seem
1352- * worthany extra code tohandle the case ).
1388+ * separate live tuples). We don't delete whenbacktracking , though,
1389+ *since that would require teaching _bt_pagedel() about backtracking
1390+ *(doesn't seem worthadding more complexity todeal with that ).
13531391 */
13541392if (minoff > maxoff )
1355- delete_now = (blkno == orig_blkno );
1393+ attempt_pagedel = (blkno == scanblkno );
13561394else
13571395stats -> num_index_tuples += nhtidslive ;
13581396
1359- Assert (!delete_now || nhtidslive == 0 );
1397+ Assert (!attempt_pagedel || nhtidslive == 0 );
13601398}
13611399
1362- if (delete_now )
1400+ if (attempt_pagedel )
13631401{
13641402MemoryContext oldcontext ;
13651403
@@ -1372,6 +1410,7 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
13721410 * any page that a future call here from btvacuumscan is expected to
13731411 * count. There will be no double-counting.
13741412 */
1413+ Assert (blkno == scanblkno );
13751414stats -> pages_deleted += _bt_pagedel (rel ,buf ,& vstate -> oldestBtpoXact );
13761415
13771416MemoryContextSwitchTo (oldcontext );
@@ -1380,18 +1419,10 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
13801419else
13811420_bt_relbuf (rel ,buf );
13821421
1383- /*
1384- * This is really tail recursion, but if the compiler is too stupid to
1385- * optimize it as such, we'd eat an uncomfortably large amount of stack
1386- * space per recursion level (due to the arrays used to track details of
1387- * deletable/updatable items). A failure is improbable since the number
1388- * of levels isn't likely to be large ... but just in case, let's
1389- * hand-optimize into a loop.
1390- */
1391- if (recurse_to != P_NONE )
1422+ if (backtrack_to != P_NONE )
13921423{
1393- blkno = recurse_to ;
1394- gotorestart ;
1424+ blkno = backtrack_to ;
1425+ gotobacktrack ;
13951426}
13961427}
13971428