88 *
99 *
1010 * IDENTIFICATION
11- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.149 2007/02/06 14:55:11 tgl Exp $
11+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.150 2007/02/08 05:05:53 momjian Exp $
1212 *
1313 *-------------------------------------------------------------------------
1414 */
@@ -733,6 +733,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
733733rightoff ;
734734OffsetNumber maxoff ;
735735OffsetNumber i ;
736+ bool isroot ;
736737
737738rbuf = _bt_getbuf (rel ,P_NEW ,BT_WRITE );
738739origpage = BufferGetPage (buf );
@@ -747,6 +748,8 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
747748lopaque = (BTPageOpaque )PageGetSpecialPointer (leftpage );
748749ropaque = (BTPageOpaque )PageGetSpecialPointer (rightpage );
749750
751+ isroot = P_ISROOT (oopaque );
752+
750753/* if we're splitting this page, it won't be the root when we're done */
751754/* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */
752755lopaque -> btpo_flags = oopaque -> btpo_flags ;
@@ -921,61 +924,116 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
921924MarkBufferDirty (sbuf );
922925}
923926
927+ /*
928+ * By here, the original data page has been split into two new halves, and
929+ * these are correct. The algorithm requires that the left page never
930+ * move during a split, so we copy the new left page back on top of the
931+ * original. Note that this is not a waste of time, since we also require
932+ * (in the page management code) that the center of a page always be
933+ * clean, and the most efficient way to guarantee this is just to compact
934+ * the data by reinserting it into a new left page. (XXX the latter
935+ * comment is probably obsolete.)
936+ *
937+ * We need to do this before writing the WAL record, so that XLogInsert can
938+ * WAL log an image of the page if necessary.
939+ */
940+ PageRestoreTempPage (leftpage ,origpage );
941+
924942/* XLOG stuff */
925943if (!rel -> rd_istemp )
926944{
927945xl_btree_split xlrec ;
928946uint8 xlinfo ;
929947XLogRecPtr recptr ;
930- XLogRecData rdata [4 ];
948+ XLogRecData rdata [6 ];
949+ XLogRecData * lastrdata ;
931950
932- xlrec .target .node = rel -> rd_node ;
933- ItemPointerSet (& (xlrec .target .tid ),itup_blkno ,itup_off );
951+ xlrec .node = rel -> rd_node ;
952+ xlrec .leftsib = BufferGetBlockNumber (buf );
953+ xlrec .rightsib = BufferGetBlockNumber (rbuf );
954+ xlrec .firstright = firstright ;
955+ xlrec .rnext = ropaque -> btpo_next ;
956+ xlrec .level = lopaque -> btpo .level ;
957+
958+ rdata [0 ].data = (char * )& xlrec ;
959+ rdata [0 ].len = SizeOfBtreeSplit ;
960+ rdata [0 ].buffer = InvalidBuffer ;
961+
962+ lastrdata = & rdata [0 ];
963+
964+ /* Log downlink on non-leaf pages. */
965+ if (lopaque -> btpo .level > 0 )
966+ {
967+ lastrdata -> next = lastrdata + 1 ;
968+ lastrdata ++ ;
969+
970+ lastrdata -> data = (char * )& newitem -> t_tid .ip_blkid ;
971+ lastrdata -> len = sizeof (BlockIdData );
972+ lastrdata -> buffer = InvalidBuffer ;
973+ }
974+
975+ /* Log the new item, if it was inserted on the left page. If it was
976+ * put on the right page, we don't need to explicitly WAL log it
977+ * because it's included with all the other items on the right page.
978+ */
979+ lastrdata -> next = lastrdata + 1 ;
980+ lastrdata ++ ;
934981if (newitemonleft )
935- xlrec .otherblk = BufferGetBlockNumber (rbuf );
982+ {
983+ lastrdata -> data = (char * )& newitemoff ;
984+ lastrdata -> len = sizeof (OffsetNumber );
985+ lastrdata -> buffer = buf ;/* backup block 1 */
986+ lastrdata -> buffer_std = true;
987+
988+ lastrdata -> next = lastrdata + 1 ;
989+ lastrdata ++ ;
990+ lastrdata -> data = (char * )newitem ;
991+ lastrdata -> len = newitemsz ;
992+ lastrdata -> buffer = buf ;/* backup block 1 */
993+ lastrdata -> buffer_std = true;
994+ }
936995else
937- xlrec .otherblk = BufferGetBlockNumber (buf );
938- xlrec .leftblk = lopaque -> btpo_prev ;
939- xlrec .rightblk = ropaque -> btpo_next ;
940- xlrec .level = lopaque -> btpo .level ;
996+ {
997+ lastrdata -> data = NULL ;
998+ lastrdata -> len = 0 ;
999+ lastrdata -> buffer = buf ;/* backup block 1 */
1000+ lastrdata -> buffer_std = true;
1001+ }
9411002
942- /*
1003+ /* Log the contents of the right page in the format understood by
1004+ * _bt_restore_page(). We set lastrdata->buffer to InvalidBuffer,
1005+ * because we're going to recreate the whole page anyway.
1006+ *
9431007 * Direct access to page is not good but faster - we should implement
9441008 * some new func in page API. Note we only store the tuples
9451009 * themselves, knowing that the item pointers are in the same order
9461010 * and can be reconstructed by scanning the tuples. See comments for
9471011 * _bt_restore_page().
9481012 */
949- xlrec . leftlen = (( PageHeader ) leftpage ) -> pd_special -
950- (( PageHeader ) leftpage ) -> pd_upper ;
1013+ lastrdata -> next = lastrdata + 1 ;
1014+ lastrdata ++ ;
9511015
952- rdata [0 ].data = (char * )& xlrec ;
953- rdata [0 ].len = SizeOfBtreeSplit ;
954- rdata [0 ].buffer = InvalidBuffer ;
955- rdata [0 ].next = & (rdata [1 ]);
956-
957- rdata [1 ].data = (char * )leftpage + ((PageHeader )leftpage )-> pd_upper ;
958- rdata [1 ].len = xlrec .leftlen ;
959- rdata [1 ].buffer = InvalidBuffer ;
960- rdata [1 ].next = & (rdata [2 ]);
961-
962- rdata [2 ].data = (char * )rightpage + ((PageHeader )rightpage )-> pd_upper ;
963- rdata [2 ].len = ((PageHeader )rightpage )-> pd_special -
1016+ lastrdata -> data = (char * )rightpage +
9641017((PageHeader )rightpage )-> pd_upper ;
965- rdata [2 ].buffer = InvalidBuffer ;
966- rdata [2 ].next = NULL ;
1018+ lastrdata -> len = ((PageHeader )rightpage )-> pd_special -
1019+ ((PageHeader )rightpage )-> pd_upper ;
1020+ lastrdata -> buffer = InvalidBuffer ;
9671021
1022+ /* Log the right sibling, because we've changed its prev-pointer. */
9681023if (!P_RIGHTMOST (ropaque ))
9691024{
970- rdata [2 ].next = & (rdata [3 ]);
971- rdata [3 ].data = NULL ;
972- rdata [3 ].len = 0 ;
973- rdata [3 ].buffer = sbuf ;
974- rdata [3 ].buffer_std = true;
975- rdata [3 ].next = NULL ;
1025+ lastrdata -> next = lastrdata + 1 ;
1026+ lastrdata ++ ;
1027+
1028+ lastrdata -> data = NULL ;
1029+ lastrdata -> len = 0 ;
1030+ lastrdata -> buffer = sbuf ;/* backup block 2 */
1031+ lastrdata -> buffer_std = true;
9761032}
9771033
978- if (P_ISROOT (oopaque ))
1034+ lastrdata -> next = NULL ;
1035+
1036+ if (isroot )
9791037xlinfo = newitemonleft ?XLOG_BTREE_SPLIT_L_ROOT :XLOG_BTREE_SPLIT_R_ROOT ;
9801038else
9811039xlinfo = newitemonleft ?XLOG_BTREE_SPLIT_L :XLOG_BTREE_SPLIT_R ;
@@ -993,24 +1051,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
9931051}
9941052}
9951053
996- /*
997- * By here, the original data page has been split into two new halves, and
998- * these are correct. The algorithm requires that the left page never
999- * move during a split, so we copy the new left page back on top of the
1000- * original. Note that this is not a waste of time, since we also require
1001- * (in the page management code) that the center of a page always be
1002- * clean, and the most efficient way to guarantee this is just to compact
1003- * the data by reinserting it into a new left page. (XXX the latter
1004- * comment is probably obsolete.)
1005- *
1006- * It's a bit weird that we don't fill in the left page till after writing
1007- * the XLOG entry, but not really worth changing. Note that we use the
1008- * origpage data (specifically its BTP_ROOT bit) while preparing the XLOG
1009- * entry, so simply reshuffling the code won't do.
1010- */
1011-
1012- PageRestoreTempPage (leftpage ,origpage );
1013-
10141054END_CRIT_SECTION ();
10151055
10161056/* release the old right sibling */