@@ -146,6 +146,9 @@ static void bt_check_every_level(Relation rel, Relation heaprel,
146
146
bool rootdescend );
147
147
static BtreeLevel bt_check_level_from_leftmost (BtreeCheckState * state ,
148
148
BtreeLevel level );
149
+ static bool bt_leftmost_ignoring_half_dead (BtreeCheckState * state ,
150
+ BlockNumber start ,
151
+ BTPageOpaque start_opaque );
149
152
static void bt_target_page_check (BtreeCheckState * state );
150
153
static BTScanInsert bt_right_page_check_scankey (BtreeCheckState * state );
151
154
static void bt_child_check (BtreeCheckState * state ,BTScanInsert targetkey ,
@@ -770,7 +773,7 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
770
773
*/
771
774
if (state -> readonly )
772
775
{
773
- if (!P_LEFTMOST ( opaque ))
776
+ if (!bt_leftmost_ignoring_half_dead ( state , current , opaque ))
774
777
ereport (ERROR ,
775
778
(errcode (ERRCODE_INDEX_CORRUPTED ),
776
779
errmsg ("block %u is not leftmost in index \"%s\"" ,
@@ -825,10 +828,14 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
825
828
}
826
829
827
830
/*
828
- * readonly mode can only ever land on live pages and half-dead pages,
829
- * so sibling pointers should always be in mutual agreement
831
+ * Sibling links should be in mutual agreement. The tolerated mismatch,
832
+ * leftcurrent == P_NONE && btpo_prev != P_NONE, arises when the left sibling
833
+ * of the parent's low-key downlink is half-dead. (A half-dead page
834
+ * has no downlink from its parent.) Under heavyweight locking, the
835
+ * last bt_leftmost_ignoring_half_dead() validated this btpo_prev.
830
836
*/
831
- if (state -> readonly && opaque -> btpo_prev != leftcurrent )
837
+ if (state -> readonly &&
838
+ opaque -> btpo_prev != leftcurrent && leftcurrent != P_NONE )
832
839
ereport (ERROR ,
833
840
(errcode (ERRCODE_INDEX_CORRUPTED ),
834
841
errmsg ("left link/right link pair in index \"%s\" not in agreement" ,
@@ -914,6 +921,67 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
914
921
return nextleveldown ;
915
922
}
916
923
924
+ /*
925
+ * Like P_LEFTMOST(start_opaque), but accept an arbitrarily-long chain of half-dead, sibling-linked pages
926
+ * to the left of block "start". Returns true if following btpo_prev from "start" reaches P_NONE across only
927
+ * such pages, false as soon as one fails the check. If a half-dead page appears under state->readonly, the
928
+ * database exited recovery between the first-stage and second-stage WAL records of a deletion.
929
+ */
930
+ static bool
931
+ bt_leftmost_ignoring_half_dead (BtreeCheckState * state ,
932
+ BlockNumber start ,
933
+ BTPageOpaque start_opaque )
934
+ {
935
+ BlockNumber reached = start_opaque -> btpo_prev ,
936
+ reached_from = start ;
937
+ bool all_half_dead = true;
938
+
939
+ /*
940
+ * To handle the !readonly case, we'd need to accept BTP_DELETED pages and
941
+ * potentially observe nbtree/README "Page deletion and backwards scans".
942
+ */
943
+ Assert (state -> readonly );
944
+
945
+ while (reached != P_NONE && all_half_dead )
946
+ {
947
+ Page page = palloc_btree_page (state ,reached );
948
+ BTPageOpaque reached_opaque = (BTPageOpaque )PageGetSpecialPointer (page );
949
+
950
+ CHECK_FOR_INTERRUPTS ();
951
+
952
+ /*
953
+ * Try to detect btpo_prev circular links: reaching "start" again, or the page we just came from, ends the walk.
954
+ * _bt_unlink_halfdead_page() writes that side-links will continue to point to the siblings.
955
+ * Check btpo_next for that property: it must name the page this one was reached from.
956
+ */
957
+ all_half_dead = P_ISHALFDEAD (reached_opaque )&&
958
+ reached != start &&
959
+ reached != reached_from &&
960
+ reached_opaque -> btpo_next == reached_from ;
961
+ if (all_half_dead )
962
+ {
963
+ XLogRecPtr pagelsn = PageGetLSN (page );
964
+
965
+ /* pagelsn should point to an XLOG_BTREE_MARK_PAGE_HALFDEAD */
966
+ ereport (DEBUG1 ,
967
+ (errcode (ERRCODE_NO_DATA ),
968
+ errmsg_internal ("harmless interrupted page deletion detected in index \"%s\"" ,
969
+ RelationGetRelationName (state -> rel )),
970
+ errdetail_internal ("Block=%u right block=%u page lsn=%X/%X." ,
971
+ reached ,reached_from ,
972
+ (uint32 ) (pagelsn >>32 ),
973
+ (uint32 )pagelsn )));
974
+
975
+ reached_from = reached ; /* remember where we came from for the next back-link check */
976
+ reached = reached_opaque -> btpo_prev ; /* step one page further left */
977
+ }
978
+
979
+ pfree (page ); /* free the page obtained from palloc_btree_page() above */
980
+ }
981
+
982
+ return all_half_dead ; /* still true only if the walk ended at P_NONE */
983
+ }
984
+
917
985
/*
918
986
* Function performs the following checks on target page, or pages ancillary to
919
987
* target page:
@@ -1809,7 +1877,8 @@ bt_child_highkey_check(BtreeCheckState *state,
1809
1877
opaque = (BTPageOpaque )PageGetSpecialPointer (page );
1810
1878
1811
1879
/* The first page we visit at the level should be leftmost */
1812
- if (first && !BlockNumberIsValid (state -> prevrightlink )&& !P_LEFTMOST (opaque ))
1880
+ if (first && !BlockNumberIsValid (state -> prevrightlink )&&
1881
+ !bt_leftmost_ignoring_half_dead (state ,blkno ,opaque ))
1813
1882
ereport (ERROR ,
1814
1883
(errcode (ERRCODE_INDEX_CORRUPTED ),
1815
1884
errmsg ("the first child of leftmost target page is not leftmost of its level in index \"%s\"" ,