 #include "utils/rel.h"
 
 
-static Buffer _hash_getovflpage(Relation rel, Buffer metabuf);
 static uint32 _hash_firstfreebit(uint32 map);
 
 
@@ -113,13 +112,30 @@ _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)
     Page        ovflpage;
     HashPageOpaque pageopaque;
     HashPageOpaque ovflopaque;
-
-    /* allocate and lock an empty overflow page */
-    ovflbuf = _hash_getovflpage(rel, metabuf);
+    HashMetaPage metap;
+    Buffer      mapbuf = InvalidBuffer;
+    Buffer      newmapbuf = InvalidBuffer;
+    BlockNumber blkno;
+    uint32      orig_firstfree;
+    uint32      splitnum;
+    uint32     *freep = NULL;
+    uint32      max_ovflpg;
+    uint32      bit;
+    uint32      bitmap_page_bit;
+    uint32      first_page;
+    uint32      last_bit;
+    uint32      last_page;
+    uint32      i,
+                j;
+    bool        page_found = false;
 
     /*
-     * Write-lock the tail page.  It is okay to hold two buffer locks here
-     * since there cannot be anyone else contending for access to ovflbuf.
+     * Write-lock the tail page.  Here we need to maintain a locking order:
+     * first acquire the lock on the tail page of the bucket, then on the
+     * meta page to find and lock the bitmap page; once one is found, the
+     * lock on the meta page is released, and finally the lock on the new
+     * overflow buffer is acquired.  We need this locking order to avoid
+     * deadlock with backends that are doing inserts.
      */
     LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 
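A condensed sketch of the lock ordering described in the comment above, using the buffer-manager calls the function actually makes; the bitmap-page search, error handling, and the unchanged surrounding code are elided, and the variable names simply mirror the function's locals. This is an illustration for reviewers, not part of the patch:

    /* illustration only: lock ordering followed by _hash_addovflpage() */
    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);        /* 1. tail page of the bucket */
    LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);    /* 2. meta page */

    /* the meta page lock is dropped while each bitmap page is read ... */
    LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
    mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE);

    /* ... and reacquired once a free bit is found, before the overflow page
     * itself is locked last */
    LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
    ovflbuf = _hash_getinitbuf(rel, blkno);        /* 3. new/recycled overflow page */
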
@@ -153,60 +169,6 @@ _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)
         buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
     }
 
-    /* now that we have correct backlink, initialize new overflow page */
-    ovflpage = BufferGetPage(ovflbuf);
-    ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
-    ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
-    ovflopaque->hasho_nextblkno = InvalidBlockNumber;
-    ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
-    ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
-    ovflopaque->hasho_page_id = HASHO_PAGE_ID;
-
-    MarkBufferDirty(ovflbuf);
-
-    /* logically chain overflow page to previous page */
-    pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);
-    MarkBufferDirty(buf);
-    if (retain_pin)
-    {
-        /* pin will be retained only for the primary bucket page */
-        Assert(pageopaque->hasho_flag & LH_BUCKET_PAGE);
-        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-    }
-    else
-        _hash_relbuf(rel, buf);
-
-    return ovflbuf;
-}
-
-/*
- *  _hash_getovflpage()
- *
- *  Find an available overflow page and return it.  The returned buffer
- *  is pinned and write-locked, and has had _hash_pageinit() applied,
- *  but it is caller's responsibility to fill the special space.
- *
- * The caller must hold a pin, but no lock, on the metapage buffer.
- * That buffer is left in the same state at exit.
- */
-static Buffer
-_hash_getovflpage(Relation rel, Buffer metabuf)
-{
-    HashMetaPage metap;
-    Buffer      mapbuf = 0;
-    Buffer      newbuf;
-    BlockNumber blkno;
-    uint32      orig_firstfree;
-    uint32      splitnum;
-    uint32     *freep = NULL;
-    uint32      max_ovflpg;
-    uint32      bit;
-    uint32      first_page;
-    uint32      last_bit;
-    uint32      last_page;
-    uint32      i,
-                j;
-
     /* Get exclusive lock on the meta page */
     LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
 
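For context on the consolidation above: the contract that the removed _hash_getovflpage() header documented (a pinned, write-locked, initialized buffer, with the metapage pinned but not locked by the caller) now applies directly to _hash_addovflpage(). A hypothetical caller sketch, loosely modeled on the insert path; the variable names and the retain_pin flag here are placeholders, not code from this patch:

    /* caller holds a pin (no lock) on metabuf and a pin on the bucket's tail page */
    buf = _hash_addovflpage(rel, metabuf, buf, retain_pin);
    page = BufferGetPage(buf);

    /* ... place the new index tuple on the returned (empty, write-locked) page ... */

    MarkBufferDirty(buf);
    _hash_relbuf(rel, buf);     /* drop lock and pin when done */
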
@@ -255,11 +217,31 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
         for (; bit <= last_inpage; j++, bit += BITS_PER_MAP)
         {
             if (freep[j] != ALL_SET)
+            {
+                page_found = true;
+
+                /* Reacquire exclusive lock on the meta page */
+                LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
+
+                /* convert bit to bit number within page */
+                bit += _hash_firstfreebit(freep[j]);
+                bitmap_page_bit = bit;
+
+                /* convert bit to absolute bit number */
+                bit += (i << BMPG_SHIFT(metap));
+                /* Calculate address of the recycled overflow page */
+                blkno = bitno_to_blkno(metap, bit);
+
+                /* Fetch and init the recycled page */
+                ovflbuf = _hash_getinitbuf(rel, blkno);
+
                 goto found;
+            }
         }
 
         /* No free space here, try to advance to next map page */
         _hash_relbuf(rel, mapbuf);
+        mapbuf = InvalidBuffer;
         i++;
         j = 0;              /* scan from start of next map page */
         bit = 0;
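To spell out the arithmetic in the hunk above: the free bit found in freep[j] is first converted to a bit number within the bitmap page (remembered as bitmap_page_bit for the later SETBIT), then to an absolute overflow-bit number by adding i << BMPG_SHIFT(metap), and bitno_to_blkno() finally maps that to a block number by consulting hashm_spares[], since overflow pages are interleaved with the bucket pages added at each split point. A minimal stand-alone sketch of the first two steps, with hypothetical names; BITS_PER_MAP is the 32 bits of a uint32 word, and BMPG_SHIFT is, as I understand the metapage math, 15 with the default 8 kB block size:

    #include <stdint.h>

    #define BITS_PER_MAP 32     /* bits per bitmap word, as in access/hash.h */

    /*
     * i       -- index of the bitmap page being scanned
     * j       -- index of the uint32 word within that page
     * freebit -- first zero bit within freep[j] (what _hash_firstfreebit() finds)
     * bmshift -- log2 of the number of bits one bitmap page holds
     */
    static uint32_t
    overflow_bitno(uint32_t i, uint32_t j, uint32_t freebit, uint32_t bmshift)
    {
        uint32_t    bit = j * BITS_PER_MAP + freebit;   /* bit number within the page */

        return bit + (i << bmshift);                    /* absolute overflow-bit number */
    }
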
@@ -283,8 +265,15 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
          * convenient to pre-mark them as "in use" too.
          */
         bit = metap->hashm_spares[splitnum];
-        _hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit), MAIN_FORKNUM);
-        metap->hashm_spares[splitnum]++;
+
+        /* metapage already has a write lock */
+        if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
+            ereport(ERROR,
+                    (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+                     errmsg("out of overflow pages in hash index \"%s\"",
+                            RelationGetRelationName(rel))));
+
+        newmapbuf = _hash_getnewbuf(rel, bitno_to_blkno(metap, bit), MAIN_FORKNUM);
     }
     else
     {
@@ -295,49 +284,57 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
     }
 
     /* Calculate address of the new overflow page */
-    bit = metap->hashm_spares[splitnum];
+    bit = BufferIsValid(newmapbuf) ?
+        metap->hashm_spares[splitnum] + 1 : metap->hashm_spares[splitnum];
     blkno = bitno_to_blkno(metap, bit);
 
     /*
      * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the
      * relation length stays in sync with ours.  XXX It's annoying to do this
      * with metapage write lock held; would be better to use a lock that
      * doesn't block incoming searches.
+     *
+     * It is okay to hold two buffer locks here (one on the tail page of the
+     * bucket and the other on the new overflow page) since there cannot be
+     * anyone else contending for access to ovflbuf.
      */
-    newbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);
+    ovflbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);
 
-    metap->hashm_spares[splitnum]++;
+found:
 
     /*
-     * Adjust hashm_firstfree to avoid redundant searches.  But don't risk
-     * changing it if someone moved it while we were searching bitmap pages.
+     * Do the update.
      */
-    if (metap->hashm_firstfree == orig_firstfree)
-        metap->hashm_firstfree = bit + 1;
-
-    /* Write updated metapage and release lock, but not pin */
-    MarkBufferDirty(metabuf);
-    LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
-
-    return newbuf;
-
-found:
-    /* convert bit to bit number within page */
-    bit += _hash_firstfreebit(freep[j]);
-
-    /* mark page "in use" in the bitmap */
-    SETBIT(freep, bit);
-    MarkBufferDirty(mapbuf);
-    _hash_relbuf(rel, mapbuf);
+    if (page_found)
+    {
+        Assert(BufferIsValid(mapbuf));
 
-    /* Reacquire exclusive lock on the meta page */
-    LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
+        /* mark page "in use" in the bitmap */
+        SETBIT(freep, bitmap_page_bit);
+        MarkBufferDirty(mapbuf);
+    }
+    else
+    {
+        /* update the count to indicate a new overflow page is added */
+        metap->hashm_spares[splitnum]++;
 
-    /* convert bit to absolute bit number */
-    bit += (i << BMPG_SHIFT(metap));
+        if (BufferIsValid(newmapbuf))
+        {
+            _hash_initbitmapbuffer(newmapbuf, metap->hashm_bmsize, false);
+            MarkBufferDirty(newmapbuf);
+
+            /* add the new bitmap page to the metapage's list of bitmaps */
+            metap->hashm_mapp[metap->hashm_nmaps] = BufferGetBlockNumber(newmapbuf);
+            metap->hashm_nmaps++;
+            metap->hashm_spares[splitnum]++;
+            MarkBufferDirty(metabuf);
+        }
 
-    /* Calculate address of the recycled overflow page */
-    blkno = bitno_to_blkno(metap, bit);
+        /*
+         * For a new overflow page, we don't need to explicitly set the bit
+         * in the bitmap page, as by default it will be set to "in use".
+         */
+    }
 
     /*
      * Adjust hashm_firstfree to avoid redundant searches.  But don't risk
@@ -346,19 +343,39 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
     if (metap->hashm_firstfree == orig_firstfree)
     {
         metap->hashm_firstfree = bit + 1;
-
-        /* Write updated metapage and release lock, but not pin */
         MarkBufferDirty(metabuf);
-        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     }
+
+    /* initialize new overflow page */
+    ovflpage = BufferGetPage(ovflbuf);
+    ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
+    ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
+    ovflopaque->hasho_nextblkno = InvalidBlockNumber;
+    ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
+    ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
+    ovflopaque->hasho_page_id = HASHO_PAGE_ID;
+
+    MarkBufferDirty(ovflbuf);
+
+    /* logically chain overflow page to previous page */
+    pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);
+
+    MarkBufferDirty(buf);
+
+    if (retain_pin)
+        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
     else
-    {
-        /* We didn't change the metapage, so no need to write */
-        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
-    }
+        _hash_relbuf(rel, buf);
+
+    if (BufferIsValid(mapbuf))
+        _hash_relbuf(rel, mapbuf);
+
+    LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+
+    if (BufferIsValid(newmapbuf))
+        _hash_relbuf(rel, newmapbuf);
 
-    /* Fetch, init, and return the recycled page */
-    return _hash_getinitbuf(rel, blkno);
+    return ovflbuf;
 }
 
 /*
@@ -615,6 +632,42 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno,
 }
 
 
+/*
+ *  _hash_initbitmapbuffer()
+ *
+ *  Initialize a new bitmap page.  All bits in the new bitmap page are set to
+ *  "1", indicating "in use".
+ */
+void
+_hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage)
+{
+    Page        pg;
+    HashPageOpaque op;
+    uint32     *freep;
+
+    pg = BufferGetPage(buf);
+
+    /* initialize the page */
+    if (initpage)
+        _hash_pageinit(pg, BufferGetPageSize(buf));
+
+    /* initialize the page's special space */
+    op = (HashPageOpaque) PageGetSpecialPointer(pg);
+    op->hasho_prevblkno = InvalidBlockNumber;
+    op->hasho_nextblkno = InvalidBlockNumber;
+    op->hasho_bucket = -1;
+    op->hasho_flag = LH_BITMAP_PAGE;
+    op->hasho_page_id = HASHO_PAGE_ID;
+
+    /* set all of the bits to 1 */
+    freep = HashPageGetBitmap(pg);
+    MemSet(freep, 0xFF, bmsize);
+
+    /* Set pd_lower just past the end of the bitmap page data. */
+    ((PageHeader) pg)->pd_lower = ((char *) freep + bmsize) - (char *) pg;
+}
+
+
 /*
  *  _hash_squeezebucket(rel, bucket)
  *
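To make the freelist convention of _hash_initbitmapbuffer() concrete: filling the bitmap with 0xFF marks every overflow page the new bitmap page covers as "in use"; a page only becomes allocatable again after its bit is cleared (as _hash_freeovflpage() does). A tiny self-contained sketch of that convention; the macros below merely mirror the CLRBIT/ISSET helpers from access/hash.h and are not the real definitions:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define BITS_PER_MAP 32
    #define CLRBIT(A, N)  ((A)[(N) / BITS_PER_MAP] &= ~((uint32_t) 1 << ((N) % BITS_PER_MAP)))
    #define ISSET(A, N)   ((A)[(N) / BITS_PER_MAP] &  ((uint32_t) 1 << ((N) % BITS_PER_MAP)))

    int
    main(void)
    {
        uint32_t    freep[4];               /* stand-in for a (tiny) bitmap data area */

        memset(freep, 0xFF, sizeof(freep)); /* all bits 1 => every page "in use" */

        CLRBIT(freep, 37);                  /* free overflow bit number 37 */

        printf("bit 37 in use? %s\n", ISSET(freep, 37) ? "yes" : "no");  /* no  */
        printf("bit 36 in use? %s\n", ISSET(freep, 36) ? "yes" : "no");  /* yes */
        return 0;
    }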