@@ -146,6 +146,9 @@ static void bt_check_every_level(Relation rel, Relation heaprel,
146146bool rootdescend );
147147static BtreeLevel bt_check_level_from_leftmost (BtreeCheckState * state ,
148148BtreeLevel level );
149+ static bool bt_leftmost_ignoring_half_dead (BtreeCheckState * state ,
150+ BlockNumber start ,
151+ BTPageOpaque start_opaque );
149152static void bt_recheck_sibling_links (BtreeCheckState * state ,
150153BlockNumber btpo_prev_from_target ,
151154BlockNumber leftcurrent );
@@ -774,7 +777,7 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
774777 */
775778if (state -> readonly )
776779{
777- if (!P_LEFTMOST ( opaque ))
780+ if (!bt_leftmost_ignoring_half_dead ( state , current , opaque ))
778781ereport (ERROR ,
779782(errcode (ERRCODE_INDEX_CORRUPTED ),
780783errmsg ("block %u is not leftmost in index \"%s\"" ,
@@ -828,8 +831,16 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
828831 */
829832}
830833
831- /* Sibling links should be in mutual agreement */
832- if (opaque -> btpo_prev != leftcurrent )
834+ /*
835+ * Sibling links should be in mutual agreement. There arises
836+ * leftcurrent == P_NONE && btpo_prev != P_NONE when the left sibling
837+ * of the parent's low-key downlink is half-dead. (A half-dead page
838+ * has no downlink from its parent.) Under heavyweight locking, the
839+ * last bt_leftmost_ignoring_half_dead() validated this btpo_prev.
840+ * Without heavyweight locking, validation of the P_NONE case remains
841+ * unimplemented.
842+ */
843+ if (opaque -> btpo_prev != leftcurrent && leftcurrent != P_NONE )
833844bt_recheck_sibling_links (state ,opaque -> btpo_prev ,leftcurrent );
834845
835846/* Check level */
@@ -910,6 +921,66 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
910921return nextleveldown ;
911922}
912923
924+ /*
925+ * Like P_LEFTMOST(start_opaque), but accept an arbitrarily-long chain of
926+ * half-dead, sibling-linked pages to the left. If a half-dead page appears
927+ * under state->readonly, the database exited recovery between the first-stage
928+ * and second-stage WAL records of a deletion.
929+ */
930+ static bool
931+ bt_leftmost_ignoring_half_dead (BtreeCheckState * state ,
932+ BlockNumber start ,
933+ BTPageOpaque start_opaque )
934+ {
935+ BlockNumber reached = start_opaque -> btpo_prev ,
936+ reached_from = start ;
937+ bool all_half_dead = true;
938+
939+ /*
940+ * To handle the !readonly case, we'd need to accept BTP_DELETED pages and
941+ * potentially observe nbtree/README "Page deletion and backwards scans".
942+ */
943+ Assert (state -> readonly );
944+
945+ while (reached != P_NONE && all_half_dead )
946+ {
947+ Page page = palloc_btree_page (state ,reached );
948+ BTPageOpaque reached_opaque = (BTPageOpaque )PageGetSpecialPointer (page );
949+
950+ CHECK_FOR_INTERRUPTS ();
951+
952+ /*
953+ * Try to detect btpo_prev circular links. _bt_unlink_halfdead_page()
954+ * writes that side-links will continue to point to the siblings.
955+ * Check btpo_next for that property.
956+ */
957+ all_half_dead = P_ISHALFDEAD (reached_opaque )&&
958+ reached != start &&
959+ reached != reached_from &&
960+ reached_opaque -> btpo_next == reached_from ;
961+ if (all_half_dead )
962+ {
963+ XLogRecPtr pagelsn = PageGetLSN (page );
964+
965+ /* pagelsn should point to an XLOG_BTREE_MARK_PAGE_HALFDEAD */
966+ ereport (DEBUG1 ,
967+ (errcode (ERRCODE_NO_DATA ),
968+ errmsg_internal ("harmless interrupted page deletion detected in index \"%s\"" ,
969+ RelationGetRelationName (state -> rel )),
970+ errdetail_internal ("Block=%u right block=%u page lsn=%X/%X." ,
971+ reached ,reached_from ,
972+ LSN_FORMAT_ARGS (pagelsn ))));
973+
974+ reached_from = reached ;
975+ reached = reached_opaque -> btpo_prev ;
976+ }
977+
978+ pfree (page );
979+ }
980+
981+ return all_half_dead ;
982+ }
983+
913984/*
914985 * Raise an error when target page's left link does not point back to the
915986 * previous target page, called leftcurrent here. The leftcurrent page's
@@ -950,6 +1021,9 @@ bt_recheck_sibling_links(BtreeCheckState *state,
9501021BlockNumber btpo_prev_from_target ,
9511022BlockNumber leftcurrent )
9521023{
1024+ /* taking BTPageOpaque from metapage would give irrelevant findings */
1025+ Assert (leftcurrent != P_NONE );
1026+
9531027if (!state -> readonly )
9541028{
9551029Buffer lbuf ;
@@ -1933,7 +2007,8 @@ bt_child_highkey_check(BtreeCheckState *state,
19332007opaque = (BTPageOpaque )PageGetSpecialPointer (page );
19342008
19352009/* The first page we visit at the level should be leftmost */
1936- if (first && !BlockNumberIsValid (state -> prevrightlink )&& !P_LEFTMOST (opaque ))
2010+ if (first && !BlockNumberIsValid (state -> prevrightlink )&&
2011+ !bt_leftmost_ignoring_half_dead (state ,blkno ,opaque ))
19372012ereport (ERROR ,
19382013(errcode (ERRCODE_INDEX_CORRUPTED ),
19392014errmsg ("the first child of leftmost target page is not leftmost of its level in index \"%s\"" ,