88 *
99 *
1010 * IDENTIFICATION
11- * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.183 2004/12/31 22:00:49 pgsql Exp $
11+ * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.184 2005/01/03 18:49:41 tgl Exp $
1212 *
1313 *-------------------------------------------------------------------------
1414 */
@@ -84,7 +84,7 @@ static Buffer ReadBufferInternal(Relation reln, BlockNumber blockNum,
8484bool bufferLockHeld );
8585static BufferDesc * BufferAlloc (Relation reln ,BlockNumber blockNum ,
8686bool * foundPtr );
87- static void FlushBuffer (BufferDesc * buf ,SMgrRelation reln );
87+ static void FlushBuffer (BufferDesc * buf ,SMgrRelation reln , bool earlylock );
8888static void write_buffer (Buffer buffer ,bool unpin );
8989
9090
@@ -340,6 +340,10 @@ BufferAlloc(Relation reln,
340340 * allocated -- ours. If it had a pin it wouldn't have been on
341341 * the free list. No one else could have pinned it between
342342 * StrategyGetBuffer and here because we have the BufMgrLock.
343+ *
344+ * (We must pin the buffer before releasing BufMgrLock ourselves,
345+ * to ensure StrategyGetBuffer won't give the same buffer to someone
346+ * else.)
343347 */
344348Assert (buf -> refcount == 0 );
345349buf -> refcount = 1 ;
@@ -367,9 +371,20 @@ BufferAlloc(Relation reln,
367371
368372/*
369373 * Write the buffer out, being careful to release BufMgrLock
370- * while doing the I/O.
374+ * while doing the I/O. We also tell FlushBuffer to share-lock
375+ * the buffer before releasing BufMgrLock. This is safe because
376+ * we know no other backend currently has the buffer pinned,
377+ * therefore no one can have it locked either, so we can always
378+ * get the lock without blocking. It is necessary because if
379+ * we release BufMgrLock first, it's possible for someone else
380+ * to pin and exclusive-lock the buffer before we get to the
381+ * share-lock, causing us to block. If the someone else then
382+ * blocks on a lock we hold, deadlock ensues. This has been
383+ * observed to happen when two backends are both trying to split
384+ * btree index pages, and the second one just happens to be
385+ * trying to split the page the first one got from the freelist.
371386 */
372- FlushBuffer (buf ,NULL );
387+ FlushBuffer (buf ,NULL , true );
373388
374389/*
375390 * Somebody could have allocated another buffer for the same
@@ -766,7 +781,7 @@ BufferSync(int percent, int maxpages)
766781PinBuffer (bufHdr , true);
767782StartBufferIO (bufHdr , false);
768783
769- FlushBuffer (bufHdr ,NULL );
784+ FlushBuffer (bufHdr ,NULL , false );
770785
771786TerminateBufferIO (bufHdr ,0 );
772787UnpinBuffer (bufHdr , true);
@@ -1018,11 +1033,16 @@ BufferGetFileNode(Buffer buffer)
10181033 * If the caller has an smgr reference for the buffer's relation, pass it
10191034 * as the second parameter. If not, pass NULL. (Do not open relation
10201035 * while holding BufMgrLock!)
1036+ *
1037+ * When earlylock is TRUE, we grab the per-buffer sharelock before releasing
1038+ * BufMgrLock, rather than after. Normally this would be a bad idea since
1039+ * we might deadlock, but it is safe and necessary when called from
1040+ * BufferAlloc() --- see comments therein.
10211041 */
10221042static void
1023- FlushBuffer (BufferDesc * buf ,SMgrRelation reln )
1043+ FlushBuffer (BufferDesc * buf ,SMgrRelation reln , bool earlylock )
10241044{
1025- Buffer buffer ;
1045+ Buffer buffer = BufferDescriptorGetBuffer ( buf ) ;
10261046XLogRecPtr recptr ;
10271047ErrorContextCallback errcontext ;
10281048
@@ -1033,6 +1053,13 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
10331053/* To check if block content changed while flushing. - vadim 01/17/97 */
10341054buf -> flags &= ~BM_JUST_DIRTIED ;
10351055
1056+ /*
1057+ * If earlylock, grab buffer sharelock before anyone else could re-lock
1058+ * the buffer.
1059+ */
1060+ if (earlylock )
1061+ LockBuffer (buffer ,BUFFER_LOCK_SHARE );
1062+
10361063/* Release BufMgrLock while doing xlog work */
10371064LWLockRelease (BufMgrLock );
10381065
@@ -1046,14 +1073,13 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
10461073if (reln == NULL )
10471074reln = smgropen (buf -> tag .rnode );
10481075
1049- buffer = BufferDescriptorGetBuffer (buf );
1050-
10511076/*
10521077 * Protect buffer content against concurrent update. (Note that
10531078 * hint-bit updates can still occur while the write is in progress,
10541079 * but we assume that that will not invalidate the data written.)
10551080 */
1056- LockBuffer (buffer ,BUFFER_LOCK_SHARE );
1081+ if (!earlylock )
1082+ LockBuffer (buffer ,BUFFER_LOCK_SHARE );
10571083
10581084/*
10591085 * Force XLOG flush for buffer' LSN. This implements the basic WAL
@@ -1485,7 +1511,7 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock)
14851511{
14861512StartBufferIO (bufHdr , false);
14871513
1488- FlushBuffer (bufHdr ,rel -> rd_smgr );
1514+ FlushBuffer (bufHdr ,rel -> rd_smgr , false );
14891515
14901516TerminateBufferIO (bufHdr ,0 );
14911517}