@@ -146,6 +146,9 @@ static void bt_check_every_level(Relation rel, Relation heaprel,
146
146
bool rootdescend );
147
147
static BtreeLevel bt_check_level_from_leftmost (BtreeCheckState * state ,
148
148
BtreeLevel level );
149
+ static bool bt_leftmost_ignoring_half_dead (BtreeCheckState * state ,
150
+ BlockNumber start ,
151
+ BTPageOpaque start_opaque );
149
152
static void bt_recheck_sibling_links (BtreeCheckState * state ,
150
153
BlockNumber btpo_prev_from_target ,
151
154
BlockNumber leftcurrent );
@@ -774,7 +777,7 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
774
777
*/
775
778
if (state -> readonly )
776
779
{
777
- if (!P_LEFTMOST ( opaque ))
780
+ if (!bt_leftmost_ignoring_half_dead ( state , current , opaque ))
778
781
ereport (ERROR ,
779
782
(errcode (ERRCODE_INDEX_CORRUPTED ),
780
783
errmsg ("block %u is not leftmost in index \"%s\"" ,
@@ -828,8 +831,16 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
828
831
*/
829
832
}
830
833
831
- /* Sibling links should be in mutual agreement */
832
- if (opaque -> btpo_prev != leftcurrent )
834
+ /*
835
+ * Sibling links should be in mutual agreement. There arises
836
+ * leftcurrent == P_NONE && btpo_prev != P_NONE when the left sibling
837
+ * of the parent's low-key downlink is half-dead. (A half-dead page
838
+ * has no downlink from its parent.) Under heavyweight locking, the
839
+ * last bt_leftmost_ignoring_half_dead() validated this btpo_prev.
840
+ * Without heavyweight locking, validation of the P_NONE case remains
841
+ * unimplemented.
842
+ */
843
+ if (opaque -> btpo_prev != leftcurrent && leftcurrent != P_NONE )
833
844
bt_recheck_sibling_links (state ,opaque -> btpo_prev ,leftcurrent );
834
845
835
846
/* Check level */
@@ -910,6 +921,66 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
910
921
return nextleveldown ;
911
922
}
912
923
924
+ /*
925
+ * Like P_LEFTMOST(start_opaque), but accept an arbitrarily-long chain of
926
+ * half-dead, sibling-linked pages to the left. If a half-dead page appears
927
+ * under state->readonly, the database exited recovery between the first-stage
928
+ * and second-stage WAL records of a deletion.
929
+ */
930
+ static bool
931
+ bt_leftmost_ignoring_half_dead (BtreeCheckState * state ,
932
+ BlockNumber start ,
933
+ BTPageOpaque start_opaque )
934
+ {
935
+ BlockNumber reached = start_opaque -> btpo_prev ,
936
+ reached_from = start ;
937
+ bool all_half_dead = true;
938
+
939
+ /*
940
+ * To handle the !readonly case, we'd need to accept BTP_DELETED pages and
941
+ * potentially observe nbtree/README "Page deletion and backwards scans".
942
+ */
943
+ Assert (state -> readonly );
944
+
945
+ while (reached != P_NONE && all_half_dead )
946
+ {
947
+ Page page = palloc_btree_page (state ,reached );
948
+ BTPageOpaque reached_opaque = (BTPageOpaque )PageGetSpecialPointer (page );
949
+
950
+ CHECK_FOR_INTERRUPTS ();
951
+
952
+ /*
953
+ * Try to detect btpo_prev circular links. _bt_unlink_halfdead_page()
954
+ * writes that side-links will continue to point to the siblings.
955
+ * Check btpo_next for that property.
956
+ */
957
+ all_half_dead = P_ISHALFDEAD (reached_opaque )&&
958
+ reached != start &&
959
+ reached != reached_from &&
960
+ reached_opaque -> btpo_next == reached_from ;
961
+ if (all_half_dead )
962
+ {
963
+ XLogRecPtr pagelsn = PageGetLSN (page );
964
+
965
+ /* pagelsn should point to an XLOG_BTREE_MARK_PAGE_HALFDEAD */
966
+ ereport (DEBUG1 ,
967
+ (errcode (ERRCODE_NO_DATA ),
968
+ errmsg_internal ("harmless interrupted page deletion detected in index \"%s\"" ,
969
+ RelationGetRelationName (state -> rel )),
970
+ errdetail_internal ("Block=%u right block=%u page lsn=%X/%X." ,
971
+ reached ,reached_from ,
972
+ LSN_FORMAT_ARGS (pagelsn ))));
973
+
974
+ reached_from = reached ;
975
+ reached = reached_opaque -> btpo_prev ;
976
+ }
977
+
978
+ pfree (page );
979
+ }
980
+
981
+ return all_half_dead ;
982
+ }
983
+
913
984
/*
914
985
* Raise an error when target page's left link does not point back to the
915
986
* previous target page, called leftcurrent here. The leftcurrent page's
@@ -950,6 +1021,9 @@ bt_recheck_sibling_links(BtreeCheckState *state,
950
1021
BlockNumber btpo_prev_from_target ,
951
1022
BlockNumber leftcurrent )
952
1023
{
1024
+ /* taking BTPageOpaque from metapage would give irrelevant findings */
1025
+ Assert (leftcurrent != P_NONE );
1026
+
953
1027
if (!state -> readonly )
954
1028
{
955
1029
Buffer lbuf ;
@@ -1933,7 +2007,8 @@ bt_child_highkey_check(BtreeCheckState *state,
1933
2007
opaque = (BTPageOpaque )PageGetSpecialPointer (page );
1934
2008
1935
2009
/* The first page we visit at the level should be leftmost */
1936
- if (first && !BlockNumberIsValid (state -> prevrightlink )&& !P_LEFTMOST (opaque ))
2010
+ if (first && !BlockNumberIsValid (state -> prevrightlink )&&
2011
+ !bt_leftmost_ignoring_half_dead (state ,blkno ,opaque ))
1937
2012
ereport (ERROR ,
1938
2013
(errcode (ERRCODE_INDEX_CORRUPTED ),
1939
2014
errmsg ("the first child of leftmost target page is not leftmost of its level in index \"%s\"" ,