@@ -302,6 +302,11 @@ static bool doPageWrites;
  * so it's a plain spinlock.  The other locks are held longer (potentially
  * over I/O operations), so we use LWLocks for them.  These locks are:
  *
+ * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
+ * It is only held while initializing and changing the mapping.  If the
+ * contents of the buffer being replaced haven't been written yet, the mapping
+ * lock is released while the write is done, and reacquired afterwards.
+ *
  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
  * XLogFlush).
  *
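(The hold/release/reacquire protocol above is subtle, so here is a minimal standalone model of it -- not PostgreSQL code: a plain pthread mutex stands in for the LWLock, and page_is_dirty/write_out_page/install_new_mapping are hypothetical helpers standing in for the real write-out machinery.)

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

static pthread_mutex_t buf_mapping_lock = PTHREAD_MUTEX_INITIALIZER;

/* hypothetical stand-ins for the real buffer machinery */
extern bool page_is_dirty(int slot);
extern void write_out_page(int slot);
extern void install_new_mapping(int slot, uint64_t new_page);

void
replace_page(int slot, uint64_t new_page)
{
    pthread_mutex_lock(&buf_mapping_lock);
    while (page_is_dirty(slot))
    {
        /* Release while the write is done, per the comment above... */
        pthread_mutex_unlock(&buf_mapping_lock);
        write_out_page(slot);
        /* ...then reacquire; the slot may have changed meanwhile, so retry. */
        pthread_mutex_lock(&buf_mapping_lock);
    }
    install_new_mapping(slot, new_page);
    pthread_mutex_unlock(&buf_mapping_lock);
}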
@@ -468,32 +473,21 @@ typedef struct XLogCtlData
     pg_atomic_uint64 logFlushResult;    /* last byte + 1 flushed */
 
     /*
-     * Latest reserved for inititalization page in the cache (last byte
-     * position + 1).
+     * Latest initialized page in the cache (last byte position + 1).
      *
-     * To change the identity of a buffer, you need to advance
-     * InitializeReserved first.  To change the identity of a buffer that's
+     * To change the identity of a buffer (and InitializedUpTo), you need to
+     * hold WALBufMappingLock.  To change the identity of a buffer that's
      * still dirty, the old page needs to be written out first, and for that
      * you need WALWriteLock, and you need to ensure that there are no
      * in-progress insertions to the page by calling
      * WaitXLogInsertionsToFinish().
      */
-    pg_atomic_uint64 InitializeReserved;
-
-    /*
-     * Latest initialized page in the cache (last byte position + 1).
-     *
-     * InitializedUpTo is updated after the buffer initialization.  After
-     * update, waiters got notification using InitializedUpToCondVar.
-     */
-    pg_atomic_uint64 InitializedUpTo;
-    ConditionVariable InitializedUpToCondVar;
+    XLogRecPtr  InitializedUpTo;
 
     /*
      * These values do not change after startup, although the pointed-to pages
-     * and xlblocks values certainly do.  xlblocks values are changed
-     * lock-free according to the check for the xlog write position and are
-     * accompanied by changes of InitializeReserved and InitializedUpTo.
+     * and xlblocks values certainly do.  xlblocks values are protected by
+     * WALBufMappingLock.
      */
     char       *pages;          /* buffers for unwritten XLOG pages */
     pg_atomic_uint64 *xlblocks; /* 1st byte ptr-s + XLOG_BLCKSZ */
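(For orientation: the xlblocks array and the buffer ring are related by a plain modulo mapping. A tiny standalone sketch with made-up XLOG_BLCKSZ and buffer-count values -- the index computation mirrors the XLogRecPtrToBufIdx macro in xlog.c:)

#include <stdio.h>
#include <stdint.h>

#define XLOG_BLCKSZ 8192
#define NBUFFERS    16          /* stand-in for XLOGbuffers */

static int
rec_ptr_to_buf_idx(uint64_t recptr)
{
    /* mirrors XLogRecPtrToBufIdx: page number modulo ring size */
    return (int) ((recptr / XLOG_BLCKSZ) % NBUFFERS);
}

int
main(void)
{
    /* A page beginning at this LSN... */
    uint64_t page_begin = (uint64_t) 1000 * XLOG_BLCKSZ;
    int      idx = rec_ptr_to_buf_idx(page_begin);

    /* ...occupies slot idx, and while it does, xlblocks[idx] holds the
     * page's end pointer (1st byte ptr + XLOG_BLCKSZ). */
    printf("page %llu -> slot %d, xlblocks = %llu\n",
           (unsigned long long) page_begin, idx,
           (unsigned long long) (page_begin + XLOG_BLCKSZ));
    return 0;
}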
@@ -816,9 +810,9 @@ XLogInsertRecord(XLogRecData *rdata,
  * fullPageWrites from changing until the insertion is finished.
  *
  * Step 2 can usually be done completely in parallel.  If the required WAL
- * page is not initialized yet, you have to go through AdvanceXLInsertBuffer,
- * which will ensure it is initialized.  But the WAL writer tries to do that
- * ahead of insertions to avoid that from happening in the critical path.
+ * page is not initialized yet, you have to grab WALBufMappingLock to
+ * initialize it, but the WAL writer tries to do that ahead of insertions
+ * to avoid that from happening in the critical path.
  *
  *----------
  */
@@ -1997,70 +1991,32 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
     XLogRecPtr  NewPageEndPtr = InvalidXLogRecPtr;
     XLogRecPtr  NewPageBeginPtr;
     XLogPageHeader NewPage;
-    XLogRecPtr  ReservedPtr;
     int         npages pg_attribute_unused() = 0;
 
-    /*
-     * We must run the loop below inside the critical section as we expect
-     * XLogCtl->InitializedUpTo to eventually keep up.  The most of callers
-     * already run inside the critical section. Except for WAL writer, which
-     * passed 'opportunistic == true', and therefore we don't perform
-     * operations that could error out.
-     *
-     * Start an explicit critical section anyway though.
-     */
-    Assert(CritSectionCount > 0 || opportunistic);
-    START_CRIT_SECTION();
+    LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
 
-    /*--
-     * Loop till we get all the pages in WAL buffer before 'upto' reserved for
-     * initialization.  Multiple process can initialize different buffers with
-     * this loop in parallel as following.
-     *
-     * 1. Reserve page for initialization using XLogCtl->InitializeReserved.
-     * 2. Initialize the reserved page.
-     * 3. Attempt to advance XLogCtl->InitializedUpTo,
+    /*
+     * Now that we have the lock, check if someone initialized the page
+     * already.
      */
-    ReservedPtr = pg_atomic_read_u64(&XLogCtl->InitializeReserved);
-    while (upto >= ReservedPtr || opportunistic)
+    while (upto >= XLogCtl->InitializedUpTo || opportunistic)
     {
-        Assert(ReservedPtr % XLOG_BLCKSZ == 0);
+        nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
 
         /*
-         * Get ending-offset of the buffer page we need to replace.
-         *
-         * We don't lookup into xlblocks, but rather calculate position we
-         * must wait to be written.  If it was written, xlblocks will have this
-         * position (or uninitialized)
+         * Get ending-offset of the buffer page we need to replace (this may
+         * be zero if the buffer hasn't been used yet).  Fall through if it's
+         * already written out.
          */
-        if (ReservedPtr + XLOG_BLCKSZ > XLOG_BLCKSZ * XLOGbuffers)
-            OldPageRqstPtr = ReservedPtr + XLOG_BLCKSZ - XLOG_BLCKSZ * XLOGbuffers;
-        else
-            OldPageRqstPtr = InvalidXLogRecPtr;
-
-        if (LogwrtResult.Write < OldPageRqstPtr && opportunistic)
+        OldPageRqstPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]);
+        if (LogwrtResult.Write < OldPageRqstPtr)
         {
             /*
-             * If we just want to pre-initialize as much as we can without
-             * flushing, give up now.
+             * Nope, got work to do. If we just want to pre-initialize as much
+             * as we can without flushing, give up now.
              */
-            upto = ReservedPtr - 1;
-            break;
-        }
-
-        /*
-         * Attempt to reserve the page for initialization.  Failure means that
-         * this page got reserved by another process.
-         */
-        if (!pg_atomic_compare_exchange_u64(&XLogCtl->InitializeReserved,
-                                            &ReservedPtr,
-                                            ReservedPtr + XLOG_BLCKSZ))
-            continue;
-
-        /* Fall through if it's already written out. */
-        if (LogwrtResult.Write < OldPageRqstPtr)
-        {
-            /* Nope, got work to do. */
+            if (opportunistic)
+                break;
 
             /* Advance shared memory write request position */
             SpinLockAcquire(&XLogCtl->info_lck);
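(The rewritten loop is easiest to follow in a single-threaded model. The sketch below is hypothetical -- lock-free only because it is single-threaded -- but it mirrors the control flow and shows why opportunistic mode terminates: once every ring slot holds a not-yet-written page, the next slot's end pointer exceeds the write position and the pre-initializer gives up.)

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define XLOG_BLCKSZ 8192
#define NBUFFERS    4

static uint64_t xlblocks[NBUFFERS];     /* page end pointers, 0 = unused */
static uint64_t initialized_up_to = 0;  /* next page to initialize */
static uint64_t write_result = 0;       /* flushed up to this LSN */

static void
advance_buffers(uint64_t upto, bool opportunistic)
{
    while (upto >= initialized_up_to || opportunistic)
    {
        int      slot = (int) ((initialized_up_to / XLOG_BLCKSZ) % NBUFFERS);
        uint64_t old_page_end = xlblocks[slot];

        if (write_result < old_page_end)
        {
            if (opportunistic)
                break;                  /* would need a flush: give up */
            write_result = old_page_end;    /* models the XLogWrite() call */
        }
        xlblocks[slot] = initialized_up_to + XLOG_BLCKSZ;   /* init the page */
        initialized_up_to += XLOG_BLCKSZ;
    }
}

int
main(void)
{
    /* Pre-initialize: fills all NBUFFERS slots, then stops at the first
     * slot whose old page hasn't been "written out" yet. */
    advance_buffers(0, true);
    printf("initialized up to %llu\n",
           (unsigned long long) initialized_up_to);
    return 0;
}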
@@ -2075,6 +2031,14 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
             RefreshXLogWriteResult(LogwrtResult);
             if (LogwrtResult.Write < OldPageRqstPtr)
             {
+                /*
+                 * Must acquire write lock. Release WALBufMappingLock first,
+                 * to make sure that all insertions that we need to wait for
+                 * can finish (up to this same position). Otherwise we risk
+                 * deadlock.
+                 */
+                LWLockRelease(WALBufMappingLock);
+
                 WaitXLogInsertionsToFinish(OldPageRqstPtr);
 
                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
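(The deadlock the new comment guards against: backend A holds WALBufMappingLock and waits for backend B's in-progress insertion to finish, while B, mid-insertion, blocks acquiring WALBufMappingLock to get a fresh page -- a wait-for cycle. Releasing the mapping lock before waiting breaks it. A hypothetical pthread sketch of the corrected ordering, where wait_insertions_to_finish and write_out_up_to stand in for WaitXLogInsertionsToFinish and the XLogWrite call:)

#include <pthread.h>
#include <stdint.h>

static pthread_mutex_t buf_mapping_lock = PTHREAD_MUTEX_INITIALIZER;

/* hypothetical stand-ins for the real wait/write machinery */
extern void wait_insertions_to_finish(uint64_t upto);
extern void write_out_up_to(uint64_t upto);

void
evict_old_page(uint64_t old_page_end)
{
    /* Entered with buf_mapping_lock held. */
    pthread_mutex_unlock(&buf_mapping_lock);    /* let inserters progress */
    wait_insertions_to_finish(old_page_end);    /* now safe to wait on them */
    write_out_up_to(old_page_end);
    pthread_mutex_lock(&buf_mapping_lock);      /* reacquire and retry */
}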
@@ -2096,24 +2060,20 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
                     pgWalUsage.wal_buffers_full++;
                     TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
                 }
+                /* Re-acquire WALBufMappingLock and retry */
+                LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
+                continue;
             }
         }
 
         /*
          * Now the next buffer slot is free and we can set it up to be the
          * next output page.
          */
-        NewPageBeginPtr = ReservedPtr;
+        NewPageBeginPtr = XLogCtl->InitializedUpTo;
         NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
-        nextidx = XLogRecPtrToBufIdx(ReservedPtr);
 
-#ifdef USE_ASSERT_CHECKING
-        {
-            XLogRecPtr  storedBound = pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]);
-
-            Assert(storedBound == OldPageRqstPtr || storedBound == InvalidXLogRecPtr);
-        }
-#endif
+        Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
 
         NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
 
@@ -2179,50 +2139,11 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
         pg_write_barrier();
 
         pg_atomic_write_u64(&XLogCtl->xlblocks[nextidx], NewPageEndPtr);
-
-        /*
-         * Try to advance XLogCtl->InitializedUpTo.
-         *
-         * If the CAS operation failed, then some of previous pages are not
-         * initialized yet, and this backend gives up.
-         *
-         * Since initializer of next page might give up on advancing of
-         * InitializedUpTo, this backend have to attempt advancing until it
-         * find page "in the past" or concurrent backend succeeded at
-         * advancing.  When we finish advancing XLogCtl->InitializedUpTo, we
-         * notify all the waiters with XLogCtl->InitializedUpToCondVar.
-         */
-        while (pg_atomic_compare_exchange_u64(&XLogCtl->InitializedUpTo, &NewPageBeginPtr, NewPageEndPtr))
-        {
-            NewPageBeginPtr = NewPageEndPtr;
-            NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
-            nextidx = XLogRecPtrToBufIdx(NewPageBeginPtr);
-
-            if (pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]) != NewPageEndPtr)
-            {
-                /*
-                 * Page at nextidx wasn't initialized yet, so we cann't move
-                 * InitializedUpto further. It will be moved by backend which
-                 * will initialize nextidx.
-                 */
-                ConditionVariableBroadcast(&XLogCtl->InitializedUpToCondVar);
-                break;
-            }
-        }
+        XLogCtl->InitializedUpTo = NewPageEndPtr;
 
         npages++;
     }
-
-    END_CRIT_SECTION();
-
-    /*
-     * All the pages in WAL buffer before 'upto' were reserved for
-     * initialization.  However, some pages might be reserved by concurrent
-     * processes.  Wait till they finish initialization.
-     */
-    while (upto >= pg_atomic_read_u64(&XLogCtl->InitializedUpTo))
-        ConditionVariableSleep(&XLogCtl->InitializedUpToCondVar, WAIT_EVENT_WAL_BUFFER_INIT);
-    ConditionVariableCancelSleep();
+    LWLockRelease(WALBufMappingLock);
 
 #ifdef WAL_DEBUG
     if (XLOG_DEBUG && npages > 0)
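(The pg_write_barrier() just before the xlblocks store is the publish half of a release/acquire pairing: a reader that observes the new end pointer must also observe the initialized page contents. The same idiom restated with C11 atomics -- a sketch, not PostgreSQL's primitives; a release store combines the barrier and the store:)

#include <stdatomic.h>
#include <stdint.h>
#include <string.h>

#define XLOG_BLCKSZ 8192

static char             page[XLOG_BLCKSZ];
static _Atomic uint64_t page_end_ptr;   /* plays the role of xlblocks[idx] */

void
publish_page(uint64_t new_page_begin)
{
    /* Initialize the page contents first... */
    memset(page, 0, XLOG_BLCKSZ);
    /* ...then publish: release store = pg_write_barrier() + plain store. */
    atomic_store_explicit(&page_end_ptr,
                          new_page_begin + XLOG_BLCKSZ,
                          memory_order_release);
}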
@@ -5123,10 +5044,6 @@ XLOGShmemInit(void)
     pg_atomic_init_u64(&XLogCtl->logWriteResult, InvalidXLogRecPtr);
     pg_atomic_init_u64(&XLogCtl->logFlushResult, InvalidXLogRecPtr);
     pg_atomic_init_u64(&XLogCtl->unloggedLSN, InvalidXLogRecPtr);
-
-    pg_atomic_init_u64(&XLogCtl->InitializeReserved, InvalidXLogRecPtr);
-    pg_atomic_init_u64(&XLogCtl->InitializedUpTo, InvalidXLogRecPtr);
-    ConditionVariableInit(&XLogCtl->InitializedUpToCondVar);
 }
 
 /*
@@ -6146,7 +6063,7 @@ StartupXLOG(void)
         memset(page + len, 0, XLOG_BLCKSZ - len);
 
         pg_atomic_write_u64(&XLogCtl->xlblocks[firstIdx], endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ);
-        pg_atomic_write_u64(&XLogCtl->InitializedUpTo, endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ);
+        XLogCtl->InitializedUpTo = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ;
     }
     else
     {
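(Worked example with made-up numbers for this branch, assuming lastPageBeginPtr is EndOfLog rounded down to a page boundary: the last, partially filled WAL page is kept in the buffers, so it counts as initialized in full and InitializedUpTo lands one whole page past its beginning.)

#include <stdint.h>
#include <stdio.h>

#define XLOG_BLCKSZ 8192

int
main(void)
{
    uint64_t end_of_log = 5 * (uint64_t) XLOG_BLCKSZ + 100;    /* mid-page */
    uint64_t last_page_begin = end_of_log - end_of_log % XLOG_BLCKSZ;

    /* Prints 49152: the page [40960, 49152) is buffered and initialized. */
    printf("InitializedUpTo = %llu\n",
           (unsigned long long) (last_page_begin + XLOG_BLCKSZ));
    return 0;
}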
@@ -6155,9 +6072,8 @@ StartupXLOG(void)
          * let the first attempt to insert a log record to initialize the next
          * buffer.
          */
-        pg_atomic_write_u64(&XLogCtl->InitializedUpTo, EndOfLog);
+        XLogCtl->InitializedUpTo = EndOfLog;
     }
-    pg_atomic_write_u64(&XLogCtl->InitializeReserved, pg_atomic_read_u64(&XLogCtl->InitializedUpTo));
 
     /*
      * Update local and shared status.  This is OK to do without any locks