Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit b79575c

Browse files
committed
Reduce WAL activity for page splits:
> Currently, an index split writes all the data on the split page to
> WAL. That's a lot of WAL traffic. The tuples that are copied to the
> right page need to be WAL logged, but the tuples that stay on the
> original page don't.

Heikki Linnakangas
1 parent fe03a5f commit b79575c

File tree

3 files changed

+260
-164
lines changed

3 files changed

+260
-164
lines changed

‎src/backend/access/nbtree/nbtinsert.c

Lines changed: 91 additions & 51 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.149 2007/02/06 14:55:11 tgl Exp $
11+
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.150 2007/02/08 05:05:53 momjian Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -733,6 +733,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
733733
rightoff;
734734
OffsetNumbermaxoff;
735735
OffsetNumberi;
736+
boolisroot;
736737

737738
rbuf=_bt_getbuf(rel,P_NEW,BT_WRITE);
738739
origpage=BufferGetPage(buf);
@@ -747,6 +748,8 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
747748
lopaque= (BTPageOpaque)PageGetSpecialPointer(leftpage);
748749
ropaque= (BTPageOpaque)PageGetSpecialPointer(rightpage);
749750

751+
isroot=P_ISROOT(oopaque);
752+
750753
/* if we're splitting this page, it won't be the root when we're done */
751754
/* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */
752755
lopaque->btpo_flags=oopaque->btpo_flags;
@@ -921,61 +924,116 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
921924
MarkBufferDirty(sbuf);
922925
}
923926

927+
/*
928+
* By here, the original data page has been split into two new halves, and
929+
* these are correct. The algorithm requires that the left page never
930+
* move during a split, so we copy the new left page back on top of the
931+
* original. Note that this is not a waste of time, since we also require
932+
* (in the page management code) that the center of a page always be
933+
* clean, and the most efficient way to guarantee this is just to compact
934+
* the data by reinserting it into a new left page. (XXX the latter
935+
* comment is probably obsolete.)
936+
*
937+
* We need to do this before writing the WAL record, so that XLogInsert can
938+
* WAL log an image of the page if necessary.
939+
*/
940+
PageRestoreTempPage(leftpage,origpage);
941+
924942
/* XLOG stuff */
925943
if (!rel->rd_istemp)
926944
{
927945
xl_btree_splitxlrec;
928946
uint8xlinfo;
929947
XLogRecPtrrecptr;
930-
XLogRecDatardata[4];
948+
XLogRecDatardata[6];
949+
XLogRecData*lastrdata;
931950

932-
xlrec.target.node=rel->rd_node;
933-
ItemPointerSet(&(xlrec.target.tid),itup_blkno,itup_off);
951+
xlrec.node=rel->rd_node;
952+
xlrec.leftsib=BufferGetBlockNumber(buf);
953+
xlrec.rightsib=BufferGetBlockNumber(rbuf);
954+
xlrec.firstright=firstright;
955+
xlrec.rnext=ropaque->btpo_next;
956+
xlrec.level=lopaque->btpo.level;
957+
958+
rdata[0].data= (char*)&xlrec;
959+
rdata[0].len=SizeOfBtreeSplit;
960+
rdata[0].buffer=InvalidBuffer;
961+
962+
lastrdata=&rdata[0];
963+
964+
/* Log downlink on non-leaf pages. */
965+
if (lopaque->btpo.level>0)
966+
{
967+
lastrdata->next=lastrdata+1;
968+
lastrdata++;
969+
970+
lastrdata->data= (char*)&newitem->t_tid.ip_blkid;
971+
lastrdata->len=sizeof(BlockIdData);
972+
lastrdata->buffer=InvalidBuffer;
973+
}
974+
975+
/* Log the new item, if it was inserted on the left page. If it was
976+
* put on the right page, we don't need to explicitly WAL log it
977+
* because it's included with all the other items on the right page.
978+
*/
979+
lastrdata->next=lastrdata+1;
980+
lastrdata++;
934981
if (newitemonleft)
935-
xlrec.otherblk=BufferGetBlockNumber(rbuf);
982+
{
983+
lastrdata->data= (char*)&newitemoff;
984+
lastrdata->len=sizeof(OffsetNumber);
985+
lastrdata->buffer=buf;/* backup block 1 */
986+
lastrdata->buffer_std= true;
987+
988+
lastrdata->next=lastrdata+1;
989+
lastrdata++;
990+
lastrdata->data= (char*)newitem;
991+
lastrdata->len=newitemsz;
992+
lastrdata->buffer=buf;/* backup block 1 */
993+
lastrdata->buffer_std= true;
994+
}
936995
else
937-
xlrec.otherblk=BufferGetBlockNumber(buf);
938-
xlrec.leftblk=lopaque->btpo_prev;
939-
xlrec.rightblk=ropaque->btpo_next;
940-
xlrec.level=lopaque->btpo.level;
996+
{
997+
lastrdata->data=NULL;
998+
lastrdata->len=0;
999+
lastrdata->buffer=buf;/* backup block 1 */
1000+
lastrdata->buffer_std= true;
1001+
}
9411002

942-
/*
1003+
/* Log the contents of the right page in the format understood by
1004+
* _bt_restore_page(). We set lastrdata->buffer to InvalidBuffer,
1005+
* because we're going to recreate the whole page anyway.
1006+
*
9431007
* Direct access to page is not good but faster - we should implement
9441008
* some new func in page API. Note we only store the tuples
9451009
* themselves, knowing that the item pointers are in the same order
9461010
* and can be reconstructed by scanning the tuples. See comments for
9471011
* _bt_restore_page().
9481012
*/
949-
xlrec.leftlen=((PageHeader)leftpage)->pd_special-
950-
((PageHeader)leftpage)->pd_upper;
1013+
lastrdata->next=lastrdata+1;
1014+
lastrdata++;
9511015

952-
rdata[0].data= (char*)&xlrec;
953-
rdata[0].len=SizeOfBtreeSplit;
954-
rdata[0].buffer=InvalidBuffer;
955-
rdata[0].next=&(rdata[1]);
956-
957-
rdata[1].data= (char*)leftpage+ ((PageHeader)leftpage)->pd_upper;
958-
rdata[1].len=xlrec.leftlen;
959-
rdata[1].buffer=InvalidBuffer;
960-
rdata[1].next=&(rdata[2]);
961-
962-
rdata[2].data= (char*)rightpage+ ((PageHeader)rightpage)->pd_upper;
963-
rdata[2].len= ((PageHeader)rightpage)->pd_special-
1016+
lastrdata->data= (char*)rightpage+
9641017
((PageHeader)rightpage)->pd_upper;
965-
rdata[2].buffer=InvalidBuffer;
966-
rdata[2].next=NULL;
1018+
lastrdata->len= ((PageHeader)rightpage)->pd_special-
1019+
((PageHeader)rightpage)->pd_upper;
1020+
lastrdata->buffer=InvalidBuffer;
9671021

1022+
/* Log the right sibling, because we've changed it's prev-pointer. */
9681023
if (!P_RIGHTMOST(ropaque))
9691024
{
970-
rdata[2].next=&(rdata[3]);
971-
rdata[3].data=NULL;
972-
rdata[3].len=0;
973-
rdata[3].buffer=sbuf;
974-
rdata[3].buffer_std= true;
975-
rdata[3].next=NULL;
1025+
lastrdata->next=lastrdata+1;
1026+
lastrdata++;
1027+
1028+
lastrdata->data=NULL;
1029+
lastrdata->len=0;
1030+
lastrdata->buffer=sbuf;/* backup block 2 */
1031+
lastrdata->buffer_std= true;
9761032
}
9771033

978-
if (P_ISROOT(oopaque))
1034+
lastrdata->next=NULL;
1035+
1036+
if (isroot)
9791037
xlinfo=newitemonleft ?XLOG_BTREE_SPLIT_L_ROOT :XLOG_BTREE_SPLIT_R_ROOT;
9801038
else
9811039
xlinfo=newitemonleft ?XLOG_BTREE_SPLIT_L :XLOG_BTREE_SPLIT_R;
@@ -993,24 +1051,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
9931051
}
9941052
}
9951053

996-
/*
997-
* By here, the original data page has been split into two new halves, and
998-
* these are correct. The algorithm requires that the left page never
999-
* move during a split, so we copy the new left page back on top of the
1000-
* original. Note that this is not a waste of time, since we also require
1001-
* (in the page management code) that the center of a page always be
1002-
* clean, and the most efficient way to guarantee this is just to compact
1003-
* the data by reinserting it into a new left page. (XXX the latter
1004-
* comment is probably obsolete.)
1005-
*
1006-
* It's a bit weird that we don't fill in the left page till after writing
1007-
* the XLOG entry, but not really worth changing. Note that we use the
1008-
* origpage data (specifically its BTP_ROOT bit) while preparing the XLOG
1009-
* entry, so simply reshuffling the code won't do.
1010-
*/
1011-
1012-
PageRestoreTempPage(leftpage,origpage);
1013-
10141054
END_CRIT_SECTION();
10151055

10161056
/* release the old right sibling */

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp