 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.65 2007/04/09 22:03:57 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.66 2007/04/19 20:24:04 tgl Exp $
 *
 * NOTES
 *	  Postgres hash pages look like ordinary relation pages.  The opaque
 #include "utils/lsyscache.h"


-static BlockNumber _hash_alloc_buckets(Relation rel, uint32 nblocks);
+static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock,
+                    uint32 nblocks);
 static void _hash_splitbucket(Relation rel, Buffer metabuf,
                   Bucket obucket, Bucket nbucket,
                   BlockNumber start_oblkno,
@@ -104,8 +105,9 @@ _hash_droplock(Relation rel, BlockNumber whichlock, int access)
 *        requested buffer and its reference count has been incremented
 *        (ie, the buffer is "locked and pinned").
 *
- *        blkno == P_NEW is allowed, but it is caller's responsibility to
- *        ensure that only one process can extend the index at a time.
+ *        P_NEW is disallowed because this routine should only be used
+ *        to access pages that are known to be before the filesystem EOF.
+ *        Extending the index should be done with _hash_getnewbuf.
 *
 *        All call sites should call either _hash_checkpage or _hash_pageinit
 *        on the returned page, depending on whether the block is expected
@@ -116,6 +118,9 @@ _hash_getbuf(Relation rel, BlockNumber blkno, int access)
 {
     Buffer      buf;

+    if (blkno == P_NEW)
+        elog(ERROR, "hash AM does not use P_NEW");
+
     buf = ReadBuffer(rel, blkno);

     if (access != HASH_NOLOCK)
@@ -125,6 +130,51 @@ _hash_getbuf(Relation rel, BlockNumber blkno, int access)
     return buf;
 }

+/*
+ *    _hash_getnewbuf() -- Get a new page at the end of the index.
+ *
+ *        This has the same API as _hash_getbuf, except that we are adding
+ *        a page to the index, and hence expect the page to be past the
+ *        logical EOF.  (However, we have to support the case where it isn't,
+ *        since a prior try might have crashed after extending the filesystem
+ *        EOF but before updating the metapage to reflect the added page.)
+ *
+ *        It is caller's responsibility to ensure that only one process can
+ *        extend the index at a time.
+ *
+ *        All call sites should call _hash_pageinit on the returned page.
+ *        Also, it's difficult to imagine why access would not be HASH_WRITE.
+ */
+Buffer
+_hash_getnewbuf(Relation rel, BlockNumber blkno, int access)
+{
+    BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
+    Buffer      buf;
+
+    if (blkno == P_NEW)
+        elog(ERROR, "hash AM does not use P_NEW");
+    if (blkno > nblocks)
+        elog(ERROR, "access to noncontiguous page in hash index \"%s\"",
+             RelationGetRelationName(rel));
+
+    /* smgr insists we use P_NEW to extend the relation */
+    if (blkno == nblocks)
+    {
+        buf = ReadBuffer(rel, P_NEW);
+        if (BufferGetBlockNumber(buf) != blkno)
+            elog(ERROR, "unexpected hash relation size: %u, should be %u",
+                 BufferGetBlockNumber(buf), blkno);
+    }
+    else
+        buf = ReadBuffer(rel, blkno);
+
+    if (access != HASH_NOLOCK)
+        LockBuffer(buf, access);
+
+    /* ref count and lock type are correct */
+    return buf;
+}
+
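(A usage sketch, not part of the patch: per the API notes above, a caller
passes the block number the metapage says comes next, and must initialize
the returned page itself.  Names here are illustrative.)

    Buffer      buf;
    Page        pg;

    /* blkno must be <= the current EOF; the file is extended when equal */
    buf = _hash_getnewbuf(rel, blkno, HASH_WRITE);
    pg = BufferGetPage(buf);
    _hash_pageinit(pg, BufferGetPageSize(buf));     /* required of all call sites */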
 /*
 *    _hash_relbuf() -- release a locked buffer.
 *
@@ -238,12 +288,11 @@ _hash_metapinit(Relation rel)

     /*
      * We initialize the metapage, the first two bucket pages, and the
-     * first bitmap page in sequence, using P_NEW to cause smgrextend()
-     * calls to occur.  This ensures that the smgr level has the right
-     * idea of the physical index length.
+     * first bitmap page in sequence, using _hash_getnewbuf to cause
+     * smgrextend() calls to occur.  This ensures that the smgr level
+     * has the right idea of the physical index length.
      */
-    metabuf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
-    Assert(BufferGetBlockNumber(metabuf) == HASH_METAPAGE);
+    metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, HASH_WRITE);
     pg = BufferGetPage(metabuf);
     _hash_pageinit(pg, BufferGetPageSize(metabuf));

@@ -301,8 +350,7 @@ _hash_metapinit(Relation rel)
      */
     for (i = 0; i <= 1; i++)
     {
-        buf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
-        Assert(BufferGetBlockNumber(buf) == BUCKET_TO_BLKNO(metap, i));
+        buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), HASH_WRITE);
         pg = BufferGetPage(buf);
         _hash_pageinit(pg, BufferGetPageSize(buf));
         pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
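(For reference, a sketch of the bucket-to-block mapping this relies on;
the authoritative definition is the BUCKET_TO_BLKNO macro in
src/include/access/hash.h, which this approximates: bucket number, plus
the overflow pages recorded in hashm_spares for earlier splitpoints, plus
one for the metapage.)

    /* Approximate form of hash.h's BUCKET_TO_BLKNO; consult hash.h for
     * the exact definition. */
    #define BUCKET_TO_BLKNO(metap,B) \
        ((BlockNumber) ((B) + \
            ((B) ? (metap)->hashm_spares[_hash_log2((B)+1)-1] : 0)) + 1)

With all spares still zero, buckets 0 and 1 map to blocks 1 and 2,
directly after the metapage at block 0 (HASH_METAPAGE), so the sequential
_hash_getnewbuf calls above extend the file one block at a time.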
@@ -350,7 +398,6 @@ _hash_expandtable(Relation rel, Buffer metabuf)
     Bucket      old_bucket;
     Bucket      new_bucket;
     uint32      spare_ndx;
-    BlockNumber firstblock = InvalidBlockNumber;
     BlockNumber start_oblkno;
     BlockNumber start_nblkno;
     uint32      maxbucket;
@@ -402,39 +449,15 @@ _hash_expandtable(Relation rel, Buffer metabuf)
     if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
         goto fail;

-    /*
-     * If the split point is increasing (hashm_maxbucket's log base 2
-     * increases), we need to allocate a new batch of bucket pages.
-     */
-    new_bucket = metap->hashm_maxbucket + 1;
-    spare_ndx = _hash_log2(new_bucket + 1);
-    if (spare_ndx > metap->hashm_ovflpoint)
-    {
-        Assert(spare_ndx == metap->hashm_ovflpoint + 1);
-        /*
-         * The number of buckets in the new splitpoint is equal to the
-         * total number already in existence, i.e. new_bucket.  Currently
-         * this maps one-to-one to blocks required, but someday we may need
-         * a more complicated calculation here.
-         */
-        firstblock = _hash_alloc_buckets(rel, new_bucket);
-        if (firstblock == InvalidBlockNumber)
-            goto fail;          /* can't split due to BlockNumber overflow */
-    }
-
     /*
      * Determine which bucket is to be split, and attempt to lock the old
      * bucket.  If we can't get the lock, give up.
      *
      * The lock protects us against other backends, but not against our own
      * backend.  Must check for active scans separately.
-     *
-     * Ideally we would lock the new bucket too before proceeding, but if we
-     * are about to cross a splitpoint then the BUCKET_TO_BLKNO mapping isn't
-     * correct yet.  For simplicity we update the metapage first and then
-     * lock.  This should be okay because no one else should be trying to lock
-     * the new bucket yet...
      */
+    new_bucket = metap->hashm_maxbucket + 1;
+
     old_bucket = (new_bucket & metap->hashm_lowmask);

     start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
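(A worked example of the bucket arithmetic, with hypothetical metapage
values: suppose five buckets 0..4 exist, so hashm_maxbucket = 4 and
hashm_lowmask = 3.)

    Bucket      new_bucket = 4 + 1;     /* 5 */
    Bucket      old_bucket = 5 & 3;     /* = 1 */
    /* Bucket 1 is split: its tuples are redistributed between
     * buckets 1 and 5, since 5 mod 4 == 1. */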
@@ -445,6 +468,45 @@ _hash_expandtable(Relation rel, Buffer metabuf)
     if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE))
         goto fail;

+    /*
+     * Likewise lock the new bucket (should never fail).
+     *
+     * Note: it is safe to compute the new bucket's blkno here, even though
+     * we may still need to update the BUCKET_TO_BLKNO mapping.  This is
+     * because the current value of hashm_spares[hashm_ovflpoint] correctly
+     * shows where we are going to put a new splitpoint's worth of buckets.
+     */
+    start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
+
+    if (_hash_has_active_scan(rel, new_bucket))
+        elog(ERROR, "scan in progress on supposedly new bucket");
+
+    if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
+        elog(ERROR, "could not get lock on supposedly new bucket");
+
+    /*
+     * If the split point is increasing (hashm_maxbucket's log base 2
+     * increases), we need to allocate a new batch of bucket pages.
+     */
+    spare_ndx = _hash_log2(new_bucket + 1);
+    if (spare_ndx > metap->hashm_ovflpoint)
+    {
+        Assert(spare_ndx == metap->hashm_ovflpoint + 1);
+        /*
+         * The number of buckets in the new splitpoint is equal to the
+         * total number already in existence, i.e. new_bucket.  Currently
+         * this maps one-to-one to blocks required, but someday we may need
+         * a more complicated calculation here.
+         */
+        if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket))
+        {
+            /* can't split due to BlockNumber overflow */
+            _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
+            _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);
+            goto fail;
+        }
+    }
+
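(The spare_ndx test depends on _hash_log2, which returns the smallest i
such that 2^i >= num; a sketch matching the helper in hashutil.c:)

    uint32
    _hash_log2(uint32 num)
    {
        uint32      i;
        uint32      limit;

        for (i = 0, limit = 1; limit < num; limit <<= 1, i++)
            ;
        return i;
    }

For example, new_bucket = 4 gives spare_ndx = _hash_log2(5) = 3; if
hashm_ovflpoint is still 2, the split crosses a splitpoint boundary and a
new_bucket's worth of bucket pages must be allocated first.  This is also
why computing start_nblkno earlier was safe: in the crossing case,
BUCKET_TO_BLKNO indexes hashm_spares at spare_ndx - 1 == hashm_ovflpoint,
an entry the pending metapage update does not change.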
     /*
      * Okay to proceed with split.  Update the metapage bucket mapping info.
      *
@@ -477,20 +539,6 @@ _hash_expandtable(Relation rel, Buffer metabuf)
         metap->hashm_ovflpoint = spare_ndx;
     }

-    /* now we can compute the new bucket's primary block number */
-    start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
-
-    /* if we added a splitpoint, should match result of _hash_alloc_buckets */
-    if (firstblock != InvalidBlockNumber &&
-        firstblock != start_nblkno)
-        elog(PANIC, "unexpected hash relation size: %u, should be %u",
-             firstblock, start_nblkno);
-
-    Assert(!_hash_has_active_scan(rel, new_bucket));
-
-    if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
-        elog(PANIC, "could not get lock on supposedly new bucket");
-
     /* Done mucking with metapage */
     END_CRIT_SECTION();

@@ -539,7 +587,7 @@ _hash_expandtable(Relation rel, Buffer metabuf)
 * This does not need to initialize the new bucket pages; we'll do that as
 * each one is used by _hash_expandtable().  But we have to extend the logical
 * EOF to the end of the splitpoint; this keeps smgr's idea of the EOF in
- * sync with ours, so that overflow-page allocation works correctly.
+ * sync with ours, so that we don't get complaints from smgr.
 *
 * We do this by writing a page of zeroes at the end of the splitpoint range.
 * We expect that the filesystem will ensure that the intervening pages read
@@ -554,37 +602,30 @@ _hash_expandtable(Relation rel, Buffer metabuf)
 * for the purpose.  OTOH, adding a splitpoint is a very infrequent operation,
 * so it may not be worth worrying about.
 *
- * Returns the first block number in the new splitpoint's range, or
- * InvalidBlockNumber if allocation failed due to BlockNumber overflow.
+ * Returns TRUE if successful, or FALSE if allocation failed due to
+ * BlockNumber overflow.
 */
-static BlockNumber
-_hash_alloc_buckets(Relation rel, uint32 nblocks)
+static bool
+_hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
 {
-    BlockNumber firstblock;
     BlockNumber lastblock;
     char        zerobuf[BLCKSZ];

-    /*
-     * Since we hold metapage lock, no one else is either splitting or
-     * allocating a new page in _hash_getovflpage(); hence it's safe to
-     * assume that the relation length isn't changing under us.
-     */
-    firstblock = RelationGetNumberOfBlocks(rel);
     lastblock = firstblock + nblocks - 1;

     /*
      * Check for overflow in block number calculation; if so, we cannot
      * extend the index anymore.
      */
     if (lastblock < firstblock || lastblock == InvalidBlockNumber)
-        return InvalidBlockNumber;
+        return false;

     MemSet(zerobuf, 0, sizeof(zerobuf));

-    /* Note: we assume RelationGetNumberOfBlocks did RelationOpenSmgr for us */
+    RelationOpenSmgr(rel);
     smgrextend(rel->rd_smgr, lastblock, zerobuf, rel->rd_istemp);

-    return firstblock;
+    return true;
 }


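(The overflow guard in _hash_alloc_buckets can be sanity-checked in
isolation; a self-contained example with hypothetical values near the top
of the block-number range, recalling that BlockNumber is a uint32 and
InvalidBlockNumber is 0xFFFFFFFF:)

    BlockNumber firstblock = 0xFFFFFFF0;
    uint32      nblocks = 0x20;
    BlockNumber lastblock = firstblock + nblocks - 1;   /* wraps to 0x0000000F */

    /* lastblock < firstblock detects the wraparound, so the split is
     * refused rather than scribbling on low-numbered blocks. */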