Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit3bbd6af

Browse files
committed
Adjust btbulkdelete logic so that only one WAL record is issued while
deleting multiple index entries on a single index page. This makes fora very substantial reduction in the amount of WAL traffic during alarge delete operation.
1 parent13dadef commit3bbd6af

File tree

4 files changed

+135
-107
lines changed

4 files changed

+135
-107
lines changed

‎src/backend/access/nbtree/nbtpage.c

Lines changed: 33 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
*
1010
*
1111
* IDENTIFICATION
12-
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.61 2003/02/2306:17:13 tgl Exp $
12+
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.62 2003/02/2322:43:08 tgl Exp $
1313
*
1414
*NOTES
1515
* Postgres btree pages look like ordinary relation pages.The opaque
@@ -618,26 +618,34 @@ _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level)
618618
}
619619

620620
/*
621-
* Deleteanitem from a btree page.
621+
* Delete item(s) from a btree page.
622622
*
623623
* This must only be used for deleting leaf items. Deleting an item on a
624624
* non-leaf page has to be done as part of an atomic action that includes
625625
* deleting the page it points to.
626626
*
627627
* This routine assumes that the caller has pinned and locked the buffer,
628-
* and will write the buffer afterwards.
628+
* and will write the buffer afterwards. Also, the given itemnos *must*
629+
* appear in increasing order in the array.
629630
*/
630631
void
631-
_bt_itemdel(Relationrel,Bufferbuf,ItemPointertid)
632+
_bt_delitems(Relationrel,Bufferbuf,
633+
OffsetNumber*itemnos,intnitems)
632634
{
633635
Pagepage=BufferGetPage(buf);
634-
OffsetNumberoffno;
635-
636-
offno=ItemPointerGetOffsetNumber(tid);
636+
inti;
637637

638+
/* No elog(ERROR) until changes are logged */
638639
START_CRIT_SECTION();
639640

640-
PageIndexTupleDelete(page,offno);
641+
/*
642+
* Delete the items in reverse order so we don't have to think about
643+
* adjusting item numbers for previous deletions.
644+
*/
645+
for (i=nitems-1;i >=0;i--)
646+
{
647+
PageIndexTupleDelete(page,itemnos[i]);
648+
}
641649

642650
/* XLOG stuff */
643651
if (!rel->rd_istemp)
@@ -646,17 +654,30 @@ _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid)
646654
XLogRecPtrrecptr;
647655
XLogRecDatardata[2];
648656

649-
xlrec.target.node=rel->rd_node;
650-
xlrec.target.tid=*tid;
657+
xlrec.node=rel->rd_node;
658+
xlrec.block=BufferGetBlockNumber(buf);
651659

652660
rdata[0].buffer=InvalidBuffer;
653661
rdata[0].data= (char*)&xlrec;
654662
rdata[0].len=SizeOfBtreeDelete;
655663
rdata[0].next=&(rdata[1]);
656664

665+
/*
666+
* The target-offsets array is not in the buffer, but pretend
667+
* that it is. When XLogInsert stores the whole buffer, the offsets
668+
* array need not be stored too.
669+
*/
657670
rdata[1].buffer=buf;
658-
rdata[1].data=NULL;
659-
rdata[1].len=0;
671+
if (nitems>0)
672+
{
673+
rdata[1].data= (char*)itemnos;
674+
rdata[1].len=nitems*sizeof(OffsetNumber);
675+
}
676+
else
677+
{
678+
rdata[1].data=NULL;
679+
rdata[1].len=0;
680+
}
660681
rdata[1].next=NULL;
661682

662683
recptr=XLogInsert(RM_BTREE_ID,XLOG_BTREE_DELETE,rdata);

‎src/backend/access/nbtree/nbtree.c

Lines changed: 72 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* Portions Copyright (c) 1994, Regents of the University of California
1313
*
1414
* IDENTIFICATION
15-
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.97 2003/02/2306:17:13 tgl Exp $
15+
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.98 2003/02/2322:43:08 tgl Exp $
1616
*
1717
*-------------------------------------------------------------------------
1818
*/
@@ -572,121 +572,110 @@ btbulkdelete(PG_FUNCTION_ARGS)
572572
IndexBulkDeleteCallbackcallback= (IndexBulkDeleteCallback)PG_GETARG_POINTER(1);
573573
void*callback_state= (void*)PG_GETARG_POINTER(2);
574574
IndexBulkDeleteResult*result;
575-
BlockNumbernum_pages;
576575
doubletuples_removed;
577576
doublenum_index_tuples;
578-
IndexScanDescscan;
579-
BTScanOpaqueso;
580-
ItemPointercurrent;
577+
OffsetNumberdeletable[BLCKSZ /sizeof(OffsetNumber)];
578+
intndeletable;
579+
Bufferbuf;
580+
BlockNumbernum_pages;
581581

582582
tuples_removed=0;
583583
num_index_tuples=0;
584584

585585
/*
586-
* We use a standard IndexScanDesc scan object, but to speed up the
587-
* loop, we skip most of the wrapper layers of index_getnext and
588-
* instead call _bt_step directly.This implies holding buffer lock
589-
* on a target page throughout the loop over the page's tuples.
590-
*
591-
* Whenever we step onto a new page, we have to trade in the read
592-
* lock acquired by _bt_first or _bt_step for an exclusive write lock
593-
* (fortunately, _bt_relbuf doesn't care which kind of lock it's
594-
* releasing when it comes time for _bt_step to release our lock).
586+
* The outer loop iterates over index leaf pages, the inner over items
587+
* on a leaf page. We issue just one _bt_delitems() call per page,
588+
* so as to minimize WAL traffic.
595589
*
596-
* Note that we exclusive-lock every leaf page, or at least every one
597-
*containing data items. It sounds attractive to only exclusive-lock
590+
* Note that we exclusive-lock every leaf page containing data items,
591+
*in sequence left to right. It sounds attractive to only exclusive-lock
598592
* those containing items we need to delete, but unfortunately that
599593
* is not safe: we could then pass a stopped indexscan, which could
600594
* in rare cases lead to deleting the item it needs to find when it
601595
* resumes. (See _bt_restscan --- this could only happen if an indexscan
602596
* stops on a deletable item and then a page split moves that item
603597
* into a page further to its right, which the indexscan will have no
604-
* pin on.)
598+
* pin on.) We can skip obtaining exclusive lock on empty pages
599+
* though, since no indexscan could be stopped on those.
605600
*/
606-
scan=index_beginscan(NULL,rel,SnapshotAny,0, (ScanKey)NULL);
607-
so= (BTScanOpaque)scan->opaque;
608-
current=&(scan->currentItemData);
609-
610-
/* Use _bt_first to get started, then _bt_step to remaining tuples */
611-
if (_bt_first(scan,ForwardScanDirection))
601+
buf=_bt_get_endpoint(rel,0, false);
602+
if (BufferIsValid(buf))/* check for empty index */
612603
{
613-
Bufferbuf;
614-
BlockNumberlockedBlock=InvalidBlockNumber;
615-
616-
/* we have the buffer pinned and read-locked */
617-
buf=so->btso_curbuf;
618-
Assert(BufferIsValid(buf));
619-
620-
do
604+
for (;;)
621605
{
622606
Pagepage;
623-
BlockNumberblkno;
624-
OffsetNumberoffnum;
625-
BTItembtitem;
626607
BTPageOpaqueopaque;
627-
IndexTupleitup;
628-
ItemPointerhtup;
608+
OffsetNumberoffnum,
609+
minoff,
610+
maxoff;
611+
BlockNumbernextpage;
629612

630613
CHECK_FOR_INTERRUPTS();
631614

632-
/* current is the next index tuple */
615+
ndeletable=0;
633616
page=BufferGetPage(buf);
634-
blkno=ItemPointerGetBlockNumber(current);
635-
636-
/*
637-
* Make sure we have a super-exclusive write lock on this page.
638-
*
639-
* We assume that only concurrent insertions, not deletions,
640-
* can occur while we're not holding the page lock (the
641-
* caller should hold a suitable relation lock to ensure
642-
* this). Therefore, no items can escape being scanned because
643-
* of this temporary lock release.
644-
*/
645-
if (blkno!=lockedBlock)
617+
opaque= (BTPageOpaque)PageGetSpecialPointer(page);
618+
minoff=P_FIRSTDATAKEY(opaque);
619+
maxoff=PageGetMaxOffsetNumber(page);
620+
/* We probably cannot see deleted pages, but skip 'em if so */
621+
if (minoff <=maxoff&& !P_ISDELETED(opaque))
646622
{
623+
/*
624+
* Trade in the initial read lock for a super-exclusive
625+
* write lock on this page.
626+
*/
647627
LockBuffer(buf,BUFFER_LOCK_UNLOCK);
648628
LockBufferForCleanup(buf);
649-
lockedBlock=blkno;
650629
/*
651-
* If the page was formerly rightmost but was split while we
652-
* didn't hold the lock, and ip_posid is pointing to item
653-
* 1, then ip_posid now points at the high key not a valid
654-
* data item. In this case we need to step forward.
630+
* Recompute minoff/maxoff, both of which could have changed
631+
* while we weren't holding the lock.
655632
*/
656-
opaque= (BTPageOpaque)PageGetSpecialPointer(page);
657-
if (current->ip_posid<P_FIRSTDATAKEY(opaque))
658-
current->ip_posid=P_FIRSTDATAKEY(opaque);
659-
}
660-
661-
offnum=ItemPointerGetOffsetNumber(current);
662-
btitem= (BTItem)PageGetItem(page,PageGetItemId(page,offnum));
663-
itup=&btitem->bti_itup;
664-
htup=&(itup->t_tid);
665-
666-
if (callback(htup,callback_state))
667-
{
668-
/* Okay to delete the item from the page */
669-
_bt_itemdel(rel,buf,current);
670-
671-
/* Mark buffer dirty, but keep the lock and pin */
672-
WriteNoReleaseBuffer(buf);
673-
674-
tuples_removed+=1;
675-
633+
minoff=P_FIRSTDATAKEY(opaque);
634+
maxoff=PageGetMaxOffsetNumber(page);
676635
/*
677-
* We now need to back up the scan one item, so that the next
678-
* cycle will re-examine the same offnum on this page (which
679-
* now holds the next item).
636+
* Scan over all items to see which ones need deleted
637+
* according to the callback function.
680638
*/
681-
current->ip_posid--;
639+
for (offnum=minoff;
640+
offnum <=maxoff;
641+
offnum=OffsetNumberNext(offnum))
642+
{
643+
BTItembtitem;
644+
ItemPointerhtup;
645+
646+
btitem= (BTItem)PageGetItem(page,
647+
PageGetItemId(page,offnum));
648+
htup=&(btitem->bti_itup.t_tid);
649+
if (callback(htup,callback_state))
650+
{
651+
deletable[ndeletable++]=offnum;
652+
tuples_removed+=1;
653+
}
654+
else
655+
num_index_tuples+=1;
656+
}
657+
}
658+
/*
659+
* If we need to delete anything, do it and write the buffer;
660+
* else just release the buffer.
661+
*/
662+
nextpage=opaque->btpo_next;
663+
if (ndeletable>0)
664+
{
665+
_bt_delitems(rel,buf,deletable,ndeletable);
666+
_bt_wrtbuf(rel,buf);
682667
}
683668
else
684-
num_index_tuples+=1;
685-
}while (_bt_step(scan,&buf,ForwardScanDirection));
669+
{
670+
_bt_relbuf(rel,buf);
671+
}
672+
/* And advance to next page, if any */
673+
if (nextpage==P_NONE)
674+
break;
675+
buf=_bt_getbuf(rel,nextpage,BT_READ);
676+
}
686677
}
687678

688-
index_endscan(scan);
689-
690679
/* return statistics */
691680
num_pages=RelationGetNumberOfBlocks(rel);
692681

@@ -765,7 +754,7 @@ btvacuumcleanup(PG_FUNCTION_ARGS)
765754
}
766755
}
767756
elseif ((opaque->btpo_flags&BTP_HALF_DEAD)||
768-
P_FIRSTDATAKEY(opaque)>PageGetMaxOffsetNumber(page))
757+
P_FIRSTDATAKEY(opaque)>PageGetMaxOffsetNumber(page))
769758
{
770759
/* Empty, try to delete */
771760
intndel;

‎src/backend/access/nbtree/nbtxlog.c

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
1010
* IDENTIFICATION
11-
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.2 2003/02/2306:17:13 tgl Exp $
11+
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.3 2003/02/2322:43:08 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -379,11 +379,10 @@ btree_xlog_delete(bool redo, XLogRecPtr lsn, XLogRecord *record)
379379
return;
380380

381381
xlrec= (xl_btree_delete*)XLogRecGetData(record);
382-
reln=XLogOpenRelation(redo,RM_BTREE_ID,xlrec->target.node);
382+
reln=XLogOpenRelation(redo,RM_BTREE_ID,xlrec->node);
383383
if (!RelationIsValid(reln))
384384
return;
385-
buffer=XLogReadBuffer(false,reln,
386-
ItemPointerGetBlockNumber(&(xlrec->target.tid)));
385+
buffer=XLogReadBuffer(false,reln,xlrec->block);
387386
if (!BufferIsValid(buffer))
388387
elog(PANIC,"btree_delete_redo: block unfound");
389388
page= (Page)BufferGetPage(buffer);
@@ -396,7 +395,21 @@ btree_xlog_delete(bool redo, XLogRecPtr lsn, XLogRecord *record)
396395
return;
397396
}
398397

399-
PageIndexTupleDelete(page,ItemPointerGetOffsetNumber(&(xlrec->target.tid)));
398+
if (record->xl_len>SizeOfBtreeDelete)
399+
{
400+
OffsetNumber*unused;
401+
OffsetNumber*unend;
402+
403+
unused= (OffsetNumber*) ((char*)xlrec+SizeOfBtreeDelete);
404+
unend= (OffsetNumber*) ((char*)xlrec+record->xl_len);
405+
406+
/* be careful to delete from back to front */
407+
while (unused<unend)
408+
{
409+
unend--;
410+
PageIndexTupleDelete(page,*unend);
411+
}
412+
}
400413

401414
PageSetLSN(page,lsn);
402415
PageSetSUI(page,ThisStartUpID);
@@ -853,8 +866,8 @@ btree_desc(char *buf, uint8 xl_info, char *rec)
853866
{
854867
xl_btree_delete*xlrec= (xl_btree_delete*)rec;
855868

856-
strcat(buf,"delete:");
857-
out_target(buf,&(xlrec->target));
869+
sprintf(buf+strlen(buf),"delete:node %u/%u; blk %u",
870+
xlrec->node.tblNode,xlrec->node.relNode,xlrec->block);
858871
break;
859872
}
860873
caseXLOG_BTREE_DELETE_PAGE:

‎src/include/access/nbtree.h

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $Id: nbtree.h,v 1.66 2003/02/2306:17:13 tgl Exp $
10+
* $Id: nbtree.h,v 1.67 2003/02/2322:43:09 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -263,14 +263,18 @@ typedef struct xl_btree_split
263263
#defineSizeOfBtreeSplit(offsetof(xl_btree_split, leftlen) + sizeof(uint16))
264264

265265
/*
266-
* This is what we need to know about delete of an individual leaf btitem
266+
* This is what we need to know about delete of individual leaf btitems.
267+
* The WAL record can represent deletion of any number of btitems on a
268+
* single index page.
267269
*/
268270
typedefstructxl_btree_delete
269271
{
270-
xl_btreetidtarget;/* deleted tuple id */
272+
RelFileNodenode;
273+
BlockNumberblock;
274+
/* TARGET OFFSET NUMBERS FOLLOW AT THE END */
271275
}xl_btree_delete;
272276

273-
#defineSizeOfBtreeDelete(offsetof(xl_btreetid, tid) +SizeOfIptrData)
277+
#defineSizeOfBtreeDelete(offsetof(xl_btree_delete, block) +sizeof(BlockNumber))
274278

275279
/*
276280
* This is what we need to know about deletion of a btree page. The target
@@ -453,7 +457,8 @@ extern void _bt_wrtnorelbuf(Relation rel, Buffer buf);
453457
externvoid_bt_pageinit(Pagepage,Sizesize);
454458
externbool_bt_page_recyclable(Pagepage);
455459
externvoid_bt_metaproot(Relationrel,BlockNumberrootbknum,uint32level);
456-
externvoid_bt_itemdel(Relationrel,Bufferbuf,ItemPointertid);
460+
externvoid_bt_delitems(Relationrel,Bufferbuf,
461+
OffsetNumber*itemnos,intnitems);
457462
externint_bt_pagedel(Relationrel,Bufferbuf,boolvacuum_full);
458463

459464
/*

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp