  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *    $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.92 2002/09/04 20:31:10 momjian Exp $
+ *    $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.93 2002/10/20 20:47:31 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -603,12 +603,21 @@ btbulkdelete(PG_FUNCTION_ARGS)
      * loop, we skip most of the wrapper layers of index_getnext and
      * instead call _bt_step directly.  This implies holding buffer lock
      * on a target page throughout the loop over the page's tuples.
-     * Initially, we have a read lock acquired by _bt_step when we stepped
-     * onto the page.  If we find a tuple we need to delete, we trade in
-     * the read lock for an exclusive write lock; after that, we hold the
-     * write lock until we step off the page (fortunately, _bt_relbuf
-     * doesn't care which kind of lock it's releasing).  This should
-     * minimize the amount of work needed per page.
+     *
+     * Whenever we step onto a new page, we have to trade in the read
+     * lock acquired by _bt_first or _bt_step for an exclusive write lock
+     * (fortunately, _bt_relbuf doesn't care which kind of lock it's
+     * releasing when it comes time for _bt_step to release our lock).
+     *
+     * Note that we exclusive-lock every leaf page, or at least every one
+     * containing data items.  It sounds attractive to only exclusive-lock
+     * those containing items we need to delete, but unfortunately that
+     * is not safe: we could then pass a stopped indexscan, which could
+     * in rare cases lead to deleting the item it needs to find when it
+     * resumes.  (See _bt_restscan --- this could only happen if an indexscan
+     * stops on a deletable item and then a page split moves that item
+     * into a page further to its right, which the indexscan will have no
+     * pin on.)
      */
     scan = index_beginscan(NULL, rel, SnapshotAny, 0, (ScanKey) NULL);
     so = (BTScanOpaque) scan->opaque;
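
A note on the trade-in described above: there is no atomic upgrade from a shared buffer lock to a super-exclusive one, so the trade-in is a drop-and-reacquire, with the buffer pin carrying us across the unlocked window. A minimal sketch of the pattern, using the bufmgr calls this file already relies on (the wrapper name is illustrative, not part of the patch):

    #include "postgres.h"
    #include "storage/bufmgr.h"

    /*
     * Trade a read lock on "buf" for a super-exclusive (cleanup) lock.
     * The buffer must already be pinned: the pin is what keeps the page
     * alive during the instant we hold no lock at all.
     */
    static void
    upgrade_to_cleanup_lock(Buffer buf)
    {
        /* Drop the shared lock; the pin is retained. */
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);

        /*
         * Reacquire as a cleanup lock: exclusive, and additionally waits
         * until no other backend holds a pin on this buffer.
         */
        LockBufferForCleanup(buf);
    }

Because insertions may slip in during the unlocked window, the caller must be prepared to re-sync its position afterward, which is exactly what the next hunk does with P_FIRSTDATAKEY.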
@@ -620,7 +629,7 @@ btbulkdelete(PG_FUNCTION_ARGS)
         Buffer      buf;
         BlockNumber lockedBlock = InvalidBlockNumber;
 
-        /* we have the buffer pinned and locked */
+        /* we have the buffer pinned and read-locked */
         buf = so->btso_curbuf;
         Assert(BufferIsValid(buf));
 
@@ -637,65 +646,59 @@ btbulkdelete(PG_FUNCTION_ARGS)
             CHECK_FOR_INTERRUPTS();
 
             /* current is the next index tuple */
+            page = BufferGetPage(buf);
             blkno = ItemPointerGetBlockNumber(current);
+
+            /*
+             * Make sure we have a super-exclusive write lock on this page.
+             *
+             * We assume that only concurrent insertions, not deletions,
+             * can occur while we're not holding the page lock (the
+             * caller should hold a suitable relation lock to ensure
+             * this).  Therefore, no items can escape being scanned because
+             * of this temporary lock release.
+             */
+            if (blkno != lockedBlock)
+            {
+                LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+                LockBufferForCleanup(buf);
+                lockedBlock = blkno;
+                /*
+                 * If the page was formerly rightmost but was split while we
+                 * didn't hold the lock, and ip_posid is pointing to item
+                 * 1, then ip_posid now points at the high key not a valid
+                 * data item.  In this case we need to step forward.
+                 */
+                opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+                if (current->ip_posid < P_FIRSTDATAKEY(opaque))
+                    current->ip_posid = P_FIRSTDATAKEY(opaque);
+            }
+
             offnum = ItemPointerGetOffsetNumber(current);
-            page = BufferGetPage(buf);
             btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
             itup = &btitem->bti_itup;
             htup = &(itup->t_tid);
 
             if (callback(htup, callback_state))
             {
-                /*
-                 * If this is first deletion on this page, trade in read
-                 * lock for a really-exclusive write lock.  Then, step
-                 * back one and re-examine the item, because other
-                 * backends might have inserted item(s) while we weren't
-                 * holding the lock!
-                 *
-                 * We assume that only concurrent insertions, not deletions,
-                 * can occur while we're not holding the page lock (the
-                 * caller should hold a suitable relation lock to ensure
-                 * this).  Therefore, the item we want to delete is either
-                 * in the same slot as before, or some slot to its right.
-                 * Rechecking the same slot is necessary and sufficient to
-                 * get back in sync after any insertions.
-                 */
-                if (blkno != lockedBlock)
-                {
-                    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-                    LockBufferForCleanup(buf);
-                    lockedBlock = blkno;
-                }
-                else
-                {
-                    /* Okay to delete the item from the page */
-                    _bt_itemdel(rel, buf, current);
-
-                    /* Mark buffer dirty, but keep the lock and pin */
-                    WriteNoReleaseBuffer(buf);
-
-                    tuples_removed += 1;
-                }
+                /* Okay to delete the item from the page */
+                _bt_itemdel(rel, buf, current);
+
+                /* Mark buffer dirty, but keep the lock and pin */
+                WriteNoReleaseBuffer(buf);
+
+                tuples_removed += 1;
 
                 /*
-                 * In either case, we now need to back up the scan one
-                 * item, so that the next cycle will re-examine the same
-                 * offnum on this page.
+                 * We now need to back up the scan one item, so that the next
+                 * cycle will re-examine the same offnum on this page (which
+                 * now holds the next item).
                  *
                  * For now, just hack the current-item index.  Will need to
                  * be smarter when deletion includes removal of empty
                  * index pages.
-                 *
-                 * We must decrement ip_posid in all cases but one: if the
-                 * page was formerly rightmost but was split while we
-                 * didn't hold the lock, and ip_posid is pointing to item
-                 * 1, then ip_posid now points at the high key not a valid
-                 * data item.  In this case we do want to step forward.
                  */
-                opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-                if (current->ip_posid >= P_FIRSTDATAKEY(opaque))
-                    current->ip_posid--;
+                current->ip_posid--;
             }
             else
                 num_index_tuples += 1;
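
The P_FIRSTDATAKEY fix-up in the hunk above leans on a page-layout convention worth spelling out: on a non-rightmost page, offset 1 holds the page's high key rather than a data item, so after a concurrent split of a formerly-rightmost page the saved position can land on the high key. A minimal sketch of the adjustment, under the same nbtree macros (the wrapper function is illustrative):

    #include "postgres.h"
    #include "access/nbtree.h"
    #include "storage/bufpage.h"
    #include "storage/itemptr.h"

    /*
     * Ensure "current" does not point at the page's high key.
     * P_FIRSTDATAKEY is 1 on a rightmost page (which has no high key)
     * and 2 everywhere else, where offset 1 holds the high key.
     */
    static void
    skip_high_key(Page page, ItemPointer current)
    {
        BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

        if (current->ip_posid < P_FIRSTDATAKEY(opaque))
            current->ip_posid = P_FIRSTDATAKEY(opaque);
    }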
@@ -717,6 +720,16 @@ btbulkdelete(PG_FUNCTION_ARGS)
 
 /*
  * Restore scan position when btgettuple is called to continue a scan.
+ *
+ * This is nontrivial because concurrent insertions might have moved the
+ * index tuple we stopped on.  We assume the tuple can only have moved to
+ * the right from our stop point, because we kept a pin on the buffer,
+ * and so no deletion can have occurred on that page.
+ *
+ * On entry, we have a pin but no read lock on the buffer that contained
+ * the index tuple we stopped the scan on.  On exit, we have pin and read
+ * lock on the buffer that now contains that index tuple, and the scandesc's
+ * current position is updated to point at it.
  */
 static void
 _bt_restscan(IndexScanDesc scan)
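
For orientation, the entry/exit contract above is driven from btgettuple: between calls the scan keeps only a pin on the current buffer, and each new call restores the position before stepping. Roughly, the caller side looks like this (a condensed sketch of btgettuple's control flow, not the verbatim function):

    #include "postgres.h"
    #include "access/nbtree.h"
    #include "access/relscan.h"
    #include "access/sdir.h"

    /*
     * Condensed sketch: if the scan has a current position, re-find it
     * (_bt_restscan also re-takes the read lock) before stepping on;
     * otherwise this is the first call, so position from scratch.
     */
    static bool
    bt_continue_scan(IndexScanDesc scan, ScanDirection dir)
    {
        if (ItemPointerIsValid(&scan->currentItemData))
        {
            _bt_restscan(scan);         /* pin -> pin + read lock */
            return _bt_next(scan, dir); /* step from the restored spot */
        }
        return _bt_first(scan, dir);    /* fresh scan: find the start */
    }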
@@ -729,13 +742,14 @@ _bt_restscan(IndexScanDesc scan)
     OffsetNumber offnum = ItemPointerGetOffsetNumber(current),
                 maxoff;
     BTPageOpaque opaque;
+    Buffer      nextbuf;
     ItemPointerData target = so->curHeapIptr;
     BTItem      item;
     BlockNumber blkno;
 
     /*
-     * Get back the read lock we were holding on the buffer.  (We still
-     * have a reference-count pin on it, so need not get that.)
+     * Reacquire read lock on the buffer.  (We should still have
+     * a reference-count pin on it, so need not get that.)
      */
     LockBuffer(buf, BT_READ);
 
@@ -747,7 +761,7 @@ _bt_restscan(IndexScanDesc scan)
      * We use this as flag when first index tuple on page is deleted but
      * we do not move left (this would slowdown vacuum) - so we set
      * current->ip_posid before first index tuple on the current page
-     * (_bt_step will move it right)...
+     * (_bt_step will move it right)...  XXX still needed?
      */
     if (!ItemPointerIsValid(&target))
     {
@@ -758,7 +772,7 @@ _bt_restscan(IndexScanDesc scan)
 
     /*
      * The item we were on may have moved right due to insertions.  Find it
-     * again.
+     * again.  We use the heap TID to identify the item uniquely.
      */
     for (;;)
     {
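
The loop in the next hunk re-finds the stopped-on tuple by its heap TID, which is unique within the index; the field-by-field test on bi_hi, bi_lo and ip_posid is just an inlined ItemPointerEquals. As a sketch of the per-item predicate (the wrapper function is illustrative, not in the file):

    #include "postgres.h"
    #include "access/nbtree.h"
    #include "storage/bufpage.h"
    #include "storage/itemptr.h"

    /*
     * Does the index tuple at "offnum" on "page" reference heap tuple
     * "target"?  A match uniquely re-identifies the tuple the scan
     * stopped on, however far it has moved to the right.
     */
    static bool
    bt_item_matches_target(Page page, OffsetNumber offnum, ItemPointer target)
    {
        BTItem item = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));

        /* equivalent to comparing bi_hi, bi_lo and ip_posid separately */
        return ItemPointerEquals(&item->bti_itup.t_tid, target);
    }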
@@ -774,28 +788,33 @@ _bt_restscan(IndexScanDesc scan)
                     target.ip_blkid.bi_lo &&
                     item->bti_itup.t_tid.ip_posid == target.ip_posid)
                 {
+                    /* Found it */
                     current->ip_posid = offnum;
                     return;
                 }
             }
 
         /*
-         * By here, the item we're looking for moved right at least one
-         * page
+         * The item we're looking for moved right at least one page, so
+         * move right.  We are careful here to pin and read-lock the next
+         * page before releasing the current one.  This ensures that a
+         * concurrent btbulkdelete scan cannot pass our position --- if it
+         * did, it might be able to reach and delete our target item before
+         * we can find it again.
          */
         if (P_RIGHTMOST(opaque))
             elog(FATAL, "_bt_restscan: my bits moved right off the end of the world!"
                  "\n\tRecreate index %s.", RelationGetRelationName(rel));
 
         blkno = opaque->btpo_next;
+        nextbuf = _bt_getbuf(rel, blkno, BT_READ);
         _bt_relbuf(rel, buf);
-        buf = _bt_getbuf(rel, blkno, BT_READ);
+        so->btso_curbuf = buf = nextbuf;
         page = BufferGetPage(buf);
         maxoff = PageGetMaxOffsetNumber(page);
         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
         offnum = P_FIRSTDATAKEY(opaque);
         ItemPointerSet(current, blkno, offnum);
-        so->btso_curbuf = buf;
     }
 }
 
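
The reordering in this last hunk is the heart of the fix: pin and read-lock the right sibling before releasing the current page, so there is never an instant when the scan holds no pin between the two. Hand-over-hand, as a condensed sketch built from this file's own helpers (error handling elided; the wrapper name is illustrative):

    #include "postgres.h"
    #include "access/nbtree.h"
    #include "storage/bufmgr.h"
    #include "storage/bufpage.h"

    /*
     * Step to the right sibling, hand-over-hand.  Acquiring the next
     * page's pin and read lock before dropping the current page's means
     * a concurrent btbulkdelete, whose LockBufferForCleanup must wait
     * out every pin, can never overtake this scan and delete its target.
     */
    static Buffer
    step_right_locked(Relation rel, Buffer buf)
    {
        Page         page = BufferGetPage(buf);
        BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        BlockNumber  blkno = opaque->btpo_next;
        Buffer       nextbuf;

        nextbuf = _bt_getbuf(rel, blkno, BT_READ); /* pin + lock sibling first */
        _bt_relbuf(rel, buf);                      /* only then let the old page go */
        return nextbuf;
    }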