2525#include "storage/indexfsm.h"
2626#include "storage/lmgr.h"
2727#include "storage/procarray.h"
28+ #include "utils/snapmgr.h"
2829
2930
30- /* local state for vacuum operations */
31+ /* Entry in pending-list of TIDs we need to revisit */
32+ typedef struct spgVacPendingItem
33+ {
34+ ItemPointerData tid ;/* redirection target to visit */
35+ bool done ;/* have we dealt with this? */
36+ struct spgVacPendingItem * next ;/* list link */
37+ }spgVacPendingItem ;
38+
39+ /* Local state for vacuum operations */
3140typedef struct spgBulkDeleteState
3241{
3342/* Parameters passed in to spgvacuumscan */
3443IndexVacuumInfo * info ;
3544IndexBulkDeleteResult * stats ;
3645IndexBulkDeleteCallback callback ;
3746void * callback_state ;
47+
3848/* Additional working state */
39- SpGistState spgstate ;
40- TransactionId OldestXmin ;
41- BlockNumber lastFilledBlock ;
49+ SpGistState spgstate ;/* for SPGiST operations that need one */
50+ spgVacPendingItem * pendingList ;/* TIDs we need to (re)visit */
51+ TransactionId myXmin ;/* for detecting newly-added redirects */
52+ TransactionId OldestXmin ;/* for deciding a redirect is obsolete */
53+ BlockNumber lastFilledBlock ;/* last non-deletable block */
4254}spgBulkDeleteState ;
4355
4456
57+ /*
58+ * Add TID to pendingList, but only if not already present.
59+ *
60+ * Note that new items are always appended at the end of the list; this
61+ * ensures that scans of the list don't miss items added during the scan.
62+ */
63+ static void
64+ spgAddPendingTID (spgBulkDeleteState * bds ,ItemPointer tid )
65+ {
66+ spgVacPendingItem * pitem ;
67+ spgVacPendingItem * * listLink ;
68+
69+ /* search the list for pre-existing entry */
70+ listLink = & bds -> pendingList ;
71+ while (* listLink != NULL )
72+ {
73+ pitem = * listLink ;
74+ if (ItemPointerEquals (tid ,& pitem -> tid ))
75+ return ;/* already in list, do nothing */
76+ listLink = & pitem -> next ;
77+ }
78+ /* not there, so append new entry */
79+ pitem = (spgVacPendingItem * )palloc (sizeof (spgVacPendingItem ));
80+ pitem -> tid = * tid ;
81+ pitem -> done = false;
82+ pitem -> next = NULL ;
83+ * listLink = pitem ;
84+ }
85+
86+ /*
87+ * Clear pendingList
88+ */
89+ static void
90+ spgClearPendingList (spgBulkDeleteState * bds )
91+ {
92+ spgVacPendingItem * pitem ;
93+ spgVacPendingItem * nitem ;
94+
95+ for (pitem = bds -> pendingList ;pitem != NULL ;pitem = nitem )
96+ {
97+ nitem = pitem -> next ;
98+ /* All items in list should have been dealt with */
99+ Assert (pitem -> done );
100+ pfree (pitem );
101+ }
102+ bds -> pendingList = NULL ;
103+ }
104+
45105/*
46106 * Vacuum a regular (non-root) leaf page
47107 *
48108 * We must delete tuples that are targeted for deletion by the VACUUM,
49109 * but not move any tuples that are referenced by outside links; we assume
50110 * those are the ones that are heads of chains.
111+ *
112+ * If we find a REDIRECT that was made by a concurrently-running transaction,
113+ * we must add its target TID to pendingList. (We don't try to visit the
114+ * target immediately, first because we don't want VACUUM locking more than
115+ * one buffer at a time, and second because the duplicate-filtering logic
116+ * in spgAddPendingTID is useful to ensure we can't get caught in an infinite
117+ * loop in the face of continuous concurrent insertions.)
118+ *
119+ * If forPending is true, we are examining the page as a consequence of
120+ * chasing a redirect link, not as part of the normal sequential scan.
121+ * We still vacuum the page normally, but we don't increment the stats
122+ * about live tuples; else we'd double-count those tuples, since the page
123+ * has been or will be visited in the sequential scan as well.
51124 */
52125static void
53- vacuumLeafPage (spgBulkDeleteState * bds ,Relation index ,Buffer buffer )
126+ vacuumLeafPage (spgBulkDeleteState * bds ,Relation index ,Buffer buffer ,
127+ bool forPending )
54128{
55129Page page = BufferGetPage (buffer );
56130spgxlogVacuumLeaf xlrec ;
@@ -90,7 +164,8 @@ vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer)
90164}
91165else
92166{
93- bds -> stats -> num_index_tuples += 1 ;
167+ if (!forPending )
168+ bds -> stats -> num_index_tuples += 1 ;
94169}
95170
96171/* Form predecessor map, too */
@@ -106,6 +181,25 @@ vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer)
106181predecessor [lt -> nextOffset ]= i ;
107182}
108183}
184+ else if (lt -> tupstate == SPGIST_REDIRECT )
185+ {
186+ SpGistDeadTuple dt = (SpGistDeadTuple )lt ;
187+
188+ Assert (dt -> nextOffset == InvalidOffsetNumber );
189+ Assert (ItemPointerIsValid (& dt -> pointer ));
190+
191+ /*
192+ * Add target TID to pending list if the redirection could have
193+ * happened since VACUUM started.
194+ *
195+ * Note: we could make a tighter test by seeing if the xid is
196+ * "running" according to the active snapshot; but tqual.c doesn't
197+ * currently export a suitable API, and it's not entirely clear
198+ * that a tighter test is worth the cycles anyway.
199+ */
200+ if (TransactionIdFollowsOrEquals (dt -> xid ,bds -> myXmin ))
201+ spgAddPendingTID (bds ,& dt -> pointer );
202+ }
109203else
110204{
111205Assert (lt -> nextOffset == InvalidOffsetNumber );
@@ -545,7 +639,7 @@ spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno)
545639}
546640else
547641{
548- vacuumLeafPage (bds ,index ,buffer );
642+ vacuumLeafPage (bds ,index ,buffer , false );
549643vacuumRedirectAndPlaceholder (index ,buffer ,bds -> OldestXmin );
550644}
551645}
@@ -556,8 +650,8 @@ spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno)
556650}
557651
558652/*
559- * The rootpage must never be deleted, nor marked as available in FSM,
560- * because we don't wantit ever returned by a search for a place to
653+ * The rootpages must never be deleted, nor marked as available in FSM,
654+ * because we don't wantthem ever returned by a search for a place to
561655 * put a new tuple. Otherwise, check for empty/deletable page, and
562656 * make sure FSM knows about it.
563657 */
@@ -585,6 +679,118 @@ spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno)
585679UnlockReleaseBuffer (buffer );
586680}
587681
682+ /*
683+ * Process the pending-TID list between pages of the main scan
684+ */
685+ static void
686+ spgprocesspending (spgBulkDeleteState * bds )
687+ {
688+ Relation index = bds -> info -> index ;
689+ spgVacPendingItem * pitem ;
690+ spgVacPendingItem * nitem ;
691+ BlockNumber blkno ;
692+ Buffer buffer ;
693+ Page page ;
694+
695+ for (pitem = bds -> pendingList ;pitem != NULL ;pitem = pitem -> next )
696+ {
697+ if (pitem -> done )
698+ continue ;/* ignore already-done items */
699+
700+ /* call vacuum_delay_point while not holding any buffer lock */
701+ vacuum_delay_point ();
702+
703+ /* examine the referenced page */
704+ blkno = ItemPointerGetBlockNumber (& pitem -> tid );
705+ buffer = ReadBufferExtended (index ,MAIN_FORKNUM ,blkno ,
706+ RBM_NORMAL ,bds -> info -> strategy );
707+ LockBuffer (buffer ,BUFFER_LOCK_EXCLUSIVE );
708+ page = (Page )BufferGetPage (buffer );
709+
710+ if (PageIsNew (page )|| SpGistPageIsDeleted (page ))
711+ {
712+ /* Probably shouldn't happen, but ignore it */
713+ }
714+ else if (SpGistPageIsLeaf (page ))
715+ {
716+ if (SpGistBlockIsRoot (blkno ))
717+ {
718+ /* this should definitely not happen */
719+ elog (ERROR ,"redirection leads to root page of index \"%s\"" ,
720+ RelationGetRelationName (index ));
721+ }
722+
723+ /* deal with any deletable tuples */
724+ vacuumLeafPage (bds ,index ,buffer , true);
725+ /* might as well do this while we are here */
726+ vacuumRedirectAndPlaceholder (index ,buffer ,bds -> OldestXmin );
727+
728+ SpGistSetLastUsedPage (index ,buffer );
729+
730+ /*
731+ * We can mark as done not only this item, but any later ones
732+ * pointing at the same page, since we vacuumed the whole page.
733+ */
734+ pitem -> done = true;
735+ for (nitem = pitem -> next ;nitem != NULL ;nitem = nitem -> next )
736+ {
737+ if (ItemPointerGetBlockNumber (& nitem -> tid )== blkno )
738+ nitem -> done = true;
739+ }
740+ }
741+ else
742+ {
743+ /*
744+ * On an inner page, visit the referenced inner tuple and add
745+ * all its downlinks to the pending list. We might have pending
746+ * items for more than one inner tuple on the same page (in fact
747+ * this is pretty likely given the way space allocation works),
748+ * so get them all while we are here.
749+ */
750+ for (nitem = pitem ;nitem != NULL ;nitem = nitem -> next )
751+ {
752+ if (nitem -> done )
753+ continue ;
754+ if (ItemPointerGetBlockNumber (& nitem -> tid )== blkno )
755+ {
756+ OffsetNumber offset ;
757+ SpGistInnerTuple innerTuple ;
758+
759+ offset = ItemPointerGetOffsetNumber (& nitem -> tid );
760+ innerTuple = (SpGistInnerTuple )PageGetItem (page ,
761+ PageGetItemId (page ,offset ));
762+ if (innerTuple -> tupstate == SPGIST_LIVE )
763+ {
764+ SpGistNodeTuple node ;
765+ int i ;
766+
767+ SGITITERATE (innerTuple ,i ,node )
768+ {
769+ if (ItemPointerIsValid (& node -> t_tid ))
770+ spgAddPendingTID (bds ,& node -> t_tid );
771+ }
772+ }
773+ else if (innerTuple -> tupstate == SPGIST_REDIRECT )
774+ {
775+ /* transfer attention to redirect point */
776+ spgAddPendingTID (bds ,
777+ & ((SpGistDeadTuple )innerTuple )-> pointer );
778+ }
779+ else
780+ elog (ERROR ,"unexpected SPGiST tuple state: %d" ,
781+ innerTuple -> tupstate );
782+
783+ nitem -> done = true;
784+ }
785+ }
786+ }
787+
788+ UnlockReleaseBuffer (buffer );
789+ }
790+
791+ spgClearPendingList (bds );
792+ }
793+
588794/*
589795 * Perform a bulkdelete scan
590796 */
@@ -598,6 +804,8 @@ spgvacuumscan(spgBulkDeleteState *bds)
598804
599805/* Finish setting up spgBulkDeleteState */
600806initSpGistState (& bds -> spgstate ,index );
807+ bds -> pendingList = NULL ;
808+ bds -> myXmin = GetActiveSnapshot ()-> xmin ;
601809bds -> OldestXmin = GetOldestXmin (true, false);
602810bds -> lastFilledBlock = SPGIST_LAST_FIXED_BLKNO ;
603811
@@ -637,6 +845,9 @@ spgvacuumscan(spgBulkDeleteState *bds)
637845for (;blkno < num_pages ;blkno ++ )
638846{
639847spgvacuumpage (bds ,blkno );
848+ /* empty the pending-list after each page */
849+ if (bds -> pendingList != NULL )
850+ spgprocesspending (bds );
640851}
641852}
642853
@@ -747,7 +958,7 @@ spgvacuumcleanup(PG_FUNCTION_ARGS)
747958IndexFreeSpaceMapVacuum (index );
748959
749960/*
750- * It's quite possible for us to be fooled by concurrentpage splits into
961+ * It's quite possible for us to be fooled by concurrenttuple moves into
751962 * double-counting some index tuples, so disbelieve any total that exceeds
752963 * the underlying heap's count ... if we know that accurately. Otherwise
753964 * this might just make matters worse.