88 *
99 *
1010 * IDENTIFICATION
11- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.155 2007/03/25 19:45:14 tgl Exp $
11+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.156 2007/04/11 20:47:37 tgl Exp $
1212 *
1313 *-------------------------------------------------------------------------
1414 */
@@ -49,7 +49,7 @@ static TransactionId _bt_check_unique(Relation rel, IndexTuple itup,
4949Relation heapRel ,Buffer buf ,OffsetNumber ioffset ,
5050ScanKey itup_scankey );
5151static void _bt_findinsertloc (Relation rel ,
52- Buffer * bufptr ,
52+ Buffer * bufptr ,
5353OffsetNumber * offsetptr ,
5454int keysz ,
5555ScanKey scankey ,
@@ -66,7 +66,7 @@ static OffsetNumber _bt_findsplitloc(Relation rel, Page page,
6666OffsetNumber newitemoff ,
6767Size newitemsz ,
6868bool * newitemonleft );
69- static void _bt_checksplitloc (FindSplitData * state ,
69+ static void _bt_checksplitloc (FindSplitData * state ,
7070OffsetNumber firstoldonright ,bool newitemonleft ,
7171int dataitemstoleft ,Size firstoldonrightsz );
7272static void _bt_pgaddtup (Relation rel ,Page page ,
@@ -459,7 +459,7 @@ _bt_findinsertloc(Relation rel,
459459 * the hint supplied by the caller invalid */
460460vacuumed = true;
461461
462- if (PageGetFreeSpace (page ) >=itemsz )
462+ if (PageGetFreeSpace (page ) >=itemsz )
463463break ;/* OK, now we have enough space */
464464}
465465
@@ -506,7 +506,7 @@ _bt_findinsertloc(Relation rel,
506506 * moved right at all, we know we should insert at the start of the
507507 * page. If we didn't move right, we can use the firstlegaloff hint
508508 * if the caller supplied one, unless we vacuumed the page which
509- * might have moved tuples around making the hint invalid. If we
509+ * might have moved tuples around making the hint invalid. If we
510510 * didn't move right or can't use the hint, find the position
511511 * by searching.
512512 */
@@ -779,8 +779,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
779779Buffer sbuf = InvalidBuffer ;
780780Page spage = NULL ;
781781BTPageOpaque sopaque = NULL ;
782- OffsetNumber itup_off = 0 ;
783- BlockNumber itup_blkno = 0 ;
784782Size itemsz ;
785783ItemId itemid ;
786784IndexTuple item ;
@@ -798,6 +796,14 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
798796_bt_pageinit (leftpage ,BufferGetPageSize (buf ));
799797/* rightpage was already initialized by _bt_getbuf */
800798
799+ /*
800+ * Copy the original page's LSN and TLI into leftpage, which will become
801+ * the updated version of the page. We need this because XLogInsert will
802+ * examine these fields and possibly dump them in a page image.
803+ */
804+ PageSetLSN (leftpage ,PageGetLSN (origpage ));
805+ PageSetTLI (leftpage ,PageGetTLI (origpage ));
806+
801807/* init btree private data */
802808oopaque = (BTPageOpaque )PageGetSpecialPointer (origpage );
803809lopaque = (BTPageOpaque )PageGetSpecialPointer (leftpage );
@@ -864,7 +870,10 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
864870leftoff = OffsetNumberNext (leftoff );
865871
866872/*
867- * Now transfer all the data items to the appropriate page
873+ * Now transfer all the data items to the appropriate page.
874+ *
875+ * Note: we *must* insert at least the right page's items in item-number
876+ * order, for the benefit of _bt_restore_page().
868877 */
869878maxoff = PageGetMaxOffsetNumber (origpage );
870879
@@ -881,16 +890,12 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
881890{
882891_bt_pgaddtup (rel ,leftpage ,newitemsz ,newitem ,leftoff ,
883892"left sibling" );
884- itup_off = leftoff ;
885- itup_blkno = BufferGetBlockNumber (buf );
886893leftoff = OffsetNumberNext (leftoff );
887894}
888895else
889896{
890897_bt_pgaddtup (rel ,rightpage ,newitemsz ,newitem ,rightoff ,
891898"right sibling" );
892- itup_off = rightoff ;
893- itup_blkno = BufferGetBlockNumber (rbuf );
894899rightoff = OffsetNumberNext (rightoff );
895900}
896901}
@@ -921,8 +926,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
921926Assert (!newitemonleft );
922927_bt_pgaddtup (rel ,rightpage ,newitemsz ,newitem ,rightoff ,
923928"right sibling" );
924- itup_off = rightoff ;
925- itup_blkno = BufferGetBlockNumber (rbuf );
926929rightoff = OffsetNumberNext (rightoff );
927930}
928931
@@ -961,7 +964,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
961964
962965/*
963966 * Right sibling is locked, new siblings are prepared, but original page
964- * is not updated yet. Log changes before continuing.
967+ * is not updated yet.
965968 *
966969 * NO EREPORT(ERROR) till right sibling is updated. We can get away with
967970 * not starting the critical section till here because we haven't been
@@ -970,15 +973,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
970973 */
971974START_CRIT_SECTION ();
972975
973- MarkBufferDirty (buf );
974- MarkBufferDirty (rbuf );
975-
976- if (!P_RIGHTMOST (ropaque ))
977- {
978- sopaque -> btpo_prev = BufferGetBlockNumber (rbuf );
979- MarkBufferDirty (sbuf );
980- }
981-
982976/*
983977 * By here, the original data page has been split into two new halves, and
984978 * these are correct. The algorithm requires that the left page never
@@ -994,6 +988,15 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
994988 */
995989PageRestoreTempPage (leftpage ,origpage );
996990
991+ MarkBufferDirty (buf );
992+ MarkBufferDirty (rbuf );
993+
994+ if (!P_RIGHTMOST (ropaque ))
995+ {
996+ sopaque -> btpo_prev = BufferGetBlockNumber (rbuf );
997+ MarkBufferDirty (sbuf );
998+ }
999+
9971000/* XLOG stuff */
9981001if (!rel -> rd_istemp )
9991002{
@@ -1006,9 +1009,9 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
10061009xlrec .node = rel -> rd_node ;
10071010xlrec .leftsib = BufferGetBlockNumber (buf );
10081011xlrec .rightsib = BufferGetBlockNumber (rbuf );
1009- xlrec .firstright = firstright ;
10101012xlrec .rnext = ropaque -> btpo_next ;
10111013xlrec .level = ropaque -> btpo .level ;
1014+ xlrec .firstright = firstright ;
10121015
10131016rdata [0 ].data = (char * )& xlrec ;
10141017rdata [0 ].len = SizeOfBtreeSplit ;
@@ -1027,54 +1030,68 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
10271030lastrdata -> buffer = InvalidBuffer ;
10281031}
10291032
1030- /* Log the new item, if it was inserted on the left page. If it was
1031- * put on the right page, we don't need to explicitly WAL log it
1032- * because it's included with all the other items on the right page.
1033+ /*
1034+ * Log the new item and its offset, if it was inserted on the left
1035+ * page. (If it was put on the right page, we don't need to explicitly
1036+ * WAL log it because it's included with all the other items on the
1037+ * right page.) Show these as belonging to the left page buffer,
1038+ * so that they are not stored if XLogInsert decides it needs a
1039+ * full-page image of the left page.
10331040 */
1034- lastrdata -> next = lastrdata + 1 ;
1035- lastrdata ++ ;
10361041if (newitemonleft )
10371042{
1043+ lastrdata -> next = lastrdata + 1 ;
1044+ lastrdata ++ ;
10381045lastrdata -> data = (char * )& newitemoff ;
10391046lastrdata -> len = sizeof (OffsetNumber );
10401047lastrdata -> buffer = buf ;/* backup block 1 */
10411048lastrdata -> buffer_std = true;
10421049
10431050lastrdata -> next = lastrdata + 1 ;
10441051lastrdata ++ ;
1045- lastrdata -> data = (char * )newitem ;
1046- lastrdata -> len = newitemsz ;
1052+ lastrdata -> data = (char * )newitem ;
1053+ lastrdata -> len = MAXALIGN ( newitemsz ) ;
10471054lastrdata -> buffer = buf ;/* backup block 1 */
10481055lastrdata -> buffer_std = true;
10491056}
10501057else
10511058{
1059+ /*
1060+ * Although we don't need to WAL-log the new item, we still
1061+ * need XLogInsert to consider storing a full-page image of the
1062+ * left page, so make an empty entry referencing that buffer.
1063+ * This also ensures that the left page is always backup block 1.
1064+ */
1065+ lastrdata -> next = lastrdata + 1 ;
1066+ lastrdata ++ ;
10521067lastrdata -> data = NULL ;
10531068lastrdata -> len = 0 ;
10541069lastrdata -> buffer = buf ;/* backup block 1 */
10551070lastrdata -> buffer_std = true;
10561071}
10571072
1058- /* Log the contents of the right page in the format understood by
1073+ /*
1074+ * Log the contents of the right page in the format understood by
10591075 * _bt_restore_page(). We set lastrdata->buffer to InvalidBuffer,
1060- * because we're going to recreate the whole page anyway.
1076+ * because we're going to recreate the whole page anyway, so it
1077+ * should never be stored by XLogInsert.
10611078 *
10621079 * Direct access to page is not good but faster - we should implement
10631080 * some new func in page API. Note we only store the tuples
1064- * themselves, knowing that the item pointers are in the same order
1065- * and can be reconstructed by scanning the tuples. See comments for
1081+ * themselves, knowing that they were inserted in item-number order
1082+ * and so the item pointers can be reconstructed. See comments for
10661083 * _bt_restore_page().
10671084 */
10681085lastrdata -> next = lastrdata + 1 ;
10691086lastrdata ++ ;
10701087
1071- lastrdata -> data = (char * )rightpage +
1088+ lastrdata -> data = (char * )rightpage +
10721089((PageHeader )rightpage )-> pd_upper ;
10731090lastrdata -> len = ((PageHeader )rightpage )-> pd_special -
10741091((PageHeader )rightpage )-> pd_upper ;
10751092lastrdata -> buffer = InvalidBuffer ;
10761093
1077- /* Log the right sibling, because we've changed it's prev-pointer. */
1094+ /* Log the right sibling, because we've changed its prev-pointer. */
10781095if (!P_RIGHTMOST (ropaque ))
10791096{
10801097lastrdata -> next = lastrdata + 1 ;
@@ -1216,7 +1233,7 @@ _bt_findsplitloc(Relation rel,
12161233olddataitemstoleft = 0 ;
12171234goodenoughfound = false;
12181235maxoff = PageGetMaxOffsetNumber (page );
1219-
1236+
12201237for (offnum = P_FIRSTDATAKEY (opaque );
12211238offnum <=maxoff ;
12221239offnum = OffsetNumberNext (offnum ))
@@ -1234,7 +1251,7 @@ _bt_findsplitloc(Relation rel,
12341251olddataitemstoleft ,itemsz );
12351252
12361253else if (offnum < newitemoff )
1237- _bt_checksplitloc (& state ,offnum , false,
1254+ _bt_checksplitloc (& state ,offnum , false,
12381255olddataitemstoleft ,itemsz );
12391256else
12401257{
@@ -1285,11 +1302,11 @@ _bt_findsplitloc(Relation rel,
12851302 * items go to the left page and only the new item goes to the right page.
12861303 * In that case, firstoldonrightsz is not used.
12871304 *
1288- * olddataitemstoleft is the total size of all old items to the left of
1289- * firstoldonright.
1305+ * olddataitemstoleft is the total size of all old items to the left of
1306+ * firstoldonright.
12901307 */
12911308static void
1292- _bt_checksplitloc (FindSplitData * state ,
1309+ _bt_checksplitloc (FindSplitData * state ,
12931310OffsetNumber firstoldonright ,
12941311bool newitemonleft ,
12951312int olddataitemstoleft ,
@@ -1311,7 +1328,7 @@ _bt_checksplitloc(FindSplitData *state,
13111328
13121329/* Account for all the old tuples */
13131330leftfree = state -> leftspace - olddataitemstoleft ;
1314- rightfree = state -> rightspace -
1331+ rightfree = state -> rightspace -
13151332(state -> olddataitemstotal - olddataitemstoleft );
13161333
13171334/*
@@ -1854,7 +1871,7 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer)
18541871BTPageOpaque opaque = (BTPageOpaque )PageGetSpecialPointer (page );
18551872
18561873/*
1857- * Scan over all items to see which ones need to be deleted
1874+ * Scan over all items to see which ones need to be deleted
18581875 * according to LP_DELETE flags.
18591876 */
18601877minoff = P_FIRSTDATAKEY (opaque );