@@ -75,6 +75,8 @@ typedef struct BtreeCheckState
7575bool readonly ;
7676/* Also verifying heap has no unindexed tuples? */
7777bool heapallindexed ;
78+ /* Also making sure non-pivot tuples can be found by new search? */
79+ bool rootdescend ;
7880/* Per-page context */
7981MemoryContext targetcontext ;
8082/* Buffer access strategy */
@@ -124,10 +126,11 @@ PG_FUNCTION_INFO_V1(bt_index_check);
124126PG_FUNCTION_INFO_V1 (bt_index_parent_check );
125127
126128static void bt_index_check_internal (Oid indrelid ,bool parentcheck ,
127- bool heapallindexed );
129+ bool heapallindexed , bool rootdescend );
128130static inline void btree_index_checkable (Relation rel );
129131static void bt_check_every_level (Relation rel ,Relation heaprel ,
130- bool heapkeyspace ,bool readonly ,bool heapallindexed );
132+ bool heapkeyspace ,bool readonly ,bool heapallindexed ,
133+ bool rootdescend );
131134static BtreeLevel bt_check_level_from_leftmost (BtreeCheckState * state ,
132135BtreeLevel level );
133136static void bt_target_page_check (BtreeCheckState * state );
@@ -140,6 +143,7 @@ static void bt_tuple_present_callback(Relation index, HeapTuple htup,
140143bool tupleIsAlive ,void * checkstate );
141144static IndexTuple bt_normalize_tuple (BtreeCheckState * state ,
142145IndexTuple itup );
146+ static bool bt_rootdescend (BtreeCheckState * state ,IndexTuple itup );
143147static inline bool offset_is_negative_infinity (BTPageOpaque opaque ,
144148OffsetNumber offset );
145149static inline bool invariant_l_offset (BtreeCheckState * state ,BTScanInsert key ,
@@ -177,7 +181,7 @@ bt_index_check(PG_FUNCTION_ARGS)
177181if (PG_NARGS ()== 2 )
178182heapallindexed = PG_GETARG_BOOL (1 );
179183
180- bt_index_check_internal (indrelid , false,heapallindexed );
184+ bt_index_check_internal (indrelid , false,heapallindexed , false );
181185
182186PG_RETURN_VOID ();
183187}
@@ -196,11 +200,14 @@ bt_index_parent_check(PG_FUNCTION_ARGS)
196200{
197201Oid indrelid = PG_GETARG_OID (0 );
198202bool heapallindexed = false;
203+ bool rootdescend = false;
199204
200- if (PG_NARGS ()== 2 )
205+ if (PG_NARGS ()>= 2 )
201206heapallindexed = PG_GETARG_BOOL (1 );
207+ if (PG_NARGS ()== 3 )
208+ rootdescend = PG_GETARG_BOOL (2 );
202209
203- bt_index_check_internal (indrelid , true,heapallindexed );
210+ bt_index_check_internal (indrelid , true,heapallindexed , rootdescend );
204211
205212PG_RETURN_VOID ();
206213}
@@ -209,7 +216,8 @@ bt_index_parent_check(PG_FUNCTION_ARGS)
209216 * Helper for bt_index_[parent_]check, coordinating the bulk of the work.
210217 */
211218static void
212- bt_index_check_internal (Oid indrelid ,bool parentcheck ,bool heapallindexed )
219+ bt_index_check_internal (Oid indrelid ,bool parentcheck ,bool heapallindexed ,
220+ bool rootdescend )
213221{
214222Oid heapid ;
215223Relation indrel ;
@@ -267,7 +275,7 @@ bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed)
267275/* Check index, possibly against table it is an index on */
268276heapkeyspace = _bt_heapkeyspace (indrel );
269277bt_check_every_level (indrel ,heaprel ,heapkeyspace ,parentcheck ,
270- heapallindexed );
278+ heapallindexed , rootdescend );
271279
272280/*
273281 * Release locks early. That's ok here because nothing in the called
@@ -338,7 +346,7 @@ btree_index_checkable(Relation rel)
338346 */
339347static void
340348bt_check_every_level (Relation rel ,Relation heaprel ,bool heapkeyspace ,
341- bool readonly ,bool heapallindexed )
349+ bool readonly ,bool heapallindexed , bool rootdescend )
342350{
343351BtreeCheckState * state ;
344352Page metapage ;
@@ -362,6 +370,7 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
362370state -> heapkeyspace = heapkeyspace ;
363371state -> readonly = readonly ;
364372state -> heapallindexed = heapallindexed ;
373+ state -> rootdescend = rootdescend ;
365374
366375if (state -> heapallindexed )
367376{
@@ -430,6 +439,14 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
430439}
431440}
432441
442+ Assert (!state -> rootdescend || state -> readonly );
443+ if (state -> rootdescend && !state -> heapkeyspace )
444+ ereport (ERROR ,
445+ (errcode (ERRCODE_FEATURE_NOT_SUPPORTED ),
446+ errmsg ("cannot verify that tuples from index \"%s\" can each be found by an independent index search" ,
447+ RelationGetRelationName (rel )),
448+ errhint ("Only B-Tree version 4 indexes support rootdescend verification." )));
449+
433450/* Create context for page */
434451state -> targetcontext = AllocSetContextCreate (CurrentMemoryContext ,
435452"amcheck context" ,
@@ -922,6 +939,31 @@ bt_target_page_check(BtreeCheckState *state)
922939if (offset_is_negative_infinity (topaque ,offset ))
923940continue ;
924941
942+ /*
943+ * Readonly callers may optionally verify that non-pivot tuples can
944+ * each be found by an independent search that starts from the root
945+ */
946+ if (state -> rootdescend && P_ISLEAF (topaque )&&
947+ !bt_rootdescend (state ,itup ))
948+ {
949+ char * itid ,
950+ * htid ;
951+
952+ itid = psprintf ("(%u,%u)" ,state -> targetblock ,offset );
953+ htid = psprintf ("(%u,%u)" ,
954+ ItemPointerGetBlockNumber (& (itup -> t_tid )),
955+ ItemPointerGetOffsetNumber (& (itup -> t_tid )));
956+
957+ ereport (ERROR ,
958+ (errcode (ERRCODE_INDEX_CORRUPTED ),
959+ errmsg ("could not find tuple using search from root page in index \"%s\"" ,
960+ RelationGetRelationName (state -> rel )),
961+ errdetail_internal ("Index tid=%s points to heap tid=%s page lsn=%X/%X." ,
962+ itid ,htid ,
963+ (uint32 ) (state -> targetlsn >>32 ),
964+ (uint32 )state -> targetlsn )));
965+ }
966+
925967/* Build insertion scankey for current page offset */
926968skey = bt_mkscankey_pivotsearch (state -> rel ,itup );
927969
@@ -1526,6 +1568,9 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
15261568 * internal pages. In more general terms, a negative infinity item is
15271569 * only negative infinity with respect to the subtree that the page is
15281570 * at the root of.
1571+ *
1572+ * See also: bt_rootdescend(), which can even detect transitive
1573+ * inconsistencies on cousin leaf pages.
15291574 */
15301575if (offset_is_negative_infinity (copaque ,offset ))
15311576continue ;
@@ -1926,6 +1971,81 @@ bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup)
19261971return reformed ;
19271972}
19281973
1974+ /*
1975+ * Search for itup in index, starting from fast root page. itup must be a
1976+ * non-pivot tuple. This is only supported with heapkeyspace indexes, since
1977+ * we rely on having fully unique keys to find a match with only a single
1978+ * visit to a leaf page, barring an interrupted page split, where we may have
1979+ * to move right. (A concurrent page split is impossible because caller must
1980+ * be readonly caller.)
1981+ *
1982+ * This routine can detect very subtle transitive consistency issues across
1983+ * more than one level of the tree. Leaf pages all have a high key (even the
1984+ * rightmost page has a conceptual positive infinity high key), but not a low
1985+ * key. Their downlink in parent is a lower bound, which along with the high
1986+ * key is almost enough to detect every possible inconsistency. A downlink
1987+ * separator key value won't always be available from parent, though, because
1988+ * the first items of internal pages are negative infinity items, truncated
1989+ * down to zero attributes during internal page splits. While it's true that
1990+ * bt_downlink_check() and the high key check can detect most imaginable key
1991+ * space problems, there are remaining problems it won't detect with non-pivot
1992+ * tuples in cousin leaf pages. Starting a search from the root for every
1993+ * existing leaf tuple detects small inconsistencies in upper levels of the
1994+ * tree that cannot be detected any other way. (Besides all this, this is
1995+ * probably also useful as a direct test of the code used by index scans
1996+ * themselves.)
1997+ */
1998+ static bool
1999+ bt_rootdescend (BtreeCheckState * state ,IndexTuple itup )
2000+ {
2001+ BTScanInsert key ;
2002+ BTStack stack ;
2003+ Buffer lbuf ;
2004+ bool exists ;
2005+
2006+ key = _bt_mkscankey (state -> rel ,itup );
2007+ Assert (key -> heapkeyspace && key -> scantid != NULL );
2008+
2009+ /*
2010+ * Search from root.
2011+ *
2012+ * Ideally, we would arrange to only move right within _bt_search() when
2013+ * an interrupted page split is detected (i.e. when the incomplete split
2014+ * bit is found to be set), but for now we accept the possibility that
2015+ * that could conceal an inconsistency.
2016+ */
2017+ Assert (state -> readonly && state -> rootdescend );
2018+ exists = false;
2019+ stack = _bt_search (state -> rel ,key ,& lbuf ,BT_READ ,NULL );
2020+
2021+ if (BufferIsValid (lbuf ))
2022+ {
2023+ BTInsertStateData insertstate ;
2024+ OffsetNumber offnum ;
2025+ Page page ;
2026+
2027+ insertstate .itup = itup ;
2028+ insertstate .itemsz = MAXALIGN (IndexTupleSize (itup ));
2029+ insertstate .itup_key = key ;
2030+ insertstate .bounds_valid = false;
2031+ insertstate .buf = lbuf ;
2032+
2033+ /* Get matching tuple on leaf page */
2034+ offnum = _bt_binsrch_insert (state -> rel ,& insertstate );
2035+ /* Compare first >= matching item on leaf page, if any */
2036+ page = BufferGetPage (lbuf );
2037+ if (offnum <=PageGetMaxOffsetNumber (page )&&
2038+ _bt_compare (state -> rel ,key ,page ,offnum )== 0 )
2039+ exists = true;
2040+ _bt_relbuf (state -> rel ,lbuf );
2041+ }
2042+
2043+ _bt_freestack (stack );
2044+ pfree (key );
2045+
2046+ return exists ;
2047+ }
2048+
19292049/*
19302050 * Is particular offset within page (whose special state is passed by caller)
19312051 * the page negative-infinity item?