  * reading in or writing out a page buffer does not hold the control lock,
  * only the per-buffer lock for the buffer it is working on.
  *
+ * "Holding the control lock" means exclusive lock in all cases except for
+ * SimpleLruReadPage_ReadOnly(); see comments for SlruRecentlyUsed() for
+ * the implications of that.
+ *
  * When initiating I/O on a buffer, we acquire the per-buffer lock exclusively
  * before releasing the control lock.  The per-buffer lock is released after
  * completing the I/O, re-acquiring the control lock, and updating the shared
@@ -37,7 +41,7 @@
  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/slru.c,v 1.31 2005/11/22 18:17:07 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/slru.c,v 1.32 2005/12/06 18:10:06 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
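The I/O protocol spelled out in the header comment above (take the per-buffer lock before dropping the control lock, and update the shared state under the control lock again before releasing the buffer lock) can be illustrated with a small standalone sketch. This is only a toy model, not code from slru.c: pthread mutexes stand in for the control LWLock and the per-buffer LWLocks, and the shared state is reduced to a status array; all identifiers here (toy_read_page, TOY_NUM_BUFFERS, etc.) are invented for the illustration.

	#include <pthread.h>
	#include <stdio.h>
	#include <unistd.h>

	#define TOY_NUM_BUFFERS 4

	/* toy stand-ins for the LWLocks and the shared page state */
	static pthread_mutex_t control_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_mutex_t buffer_locks[TOY_NUM_BUFFERS] = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
	};
	static int page_status[TOY_NUM_BUFFERS];	/* 0 = empty, 1 = read busy, 2 = valid */

	static void
	toy_read_page(int slotno)
	{
		/* mark the slot busy while holding the control lock */
		pthread_mutex_lock(&control_lock);
		page_status[slotno] = 1;

		/* take the per-buffer lock BEFORE releasing the control lock */
		pthread_mutex_lock(&buffer_locks[slotno]);
		pthread_mutex_unlock(&control_lock);

		/* the "I/O" happens here; other buffers remain accessible meanwhile */
		usleep(1000);

		/* re-acquire control lock, update shared state, then drop the buffer lock */
		pthread_mutex_lock(&control_lock);
		page_status[slotno] = 2;
		pthread_mutex_unlock(&buffer_locks[slotno]);
		pthread_mutex_unlock(&control_lock);
	}

	int
	main(void)
	{
		toy_read_page(1);
		printf("slot 1 status after read: %d\n", page_status[1]);	/* prints 2 */
		return 0;
	}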
@@ -91,15 +95,30 @@ typedef struct SlruFlushData
 } SlruFlushData;
 
 /*
- * Macro to mark a buffer slot "most recently used".
+ * Macro to mark a buffer slot "most recently used".  Note multiple evaluation
+ * of arguments!
+ *
+ * The reason for the if-test is that there are often many consecutive
+ * accesses to the same page (particularly the latest page).  By suppressing
+ * useless increments of cur_lru_count, we reduce the probability that old
+ * pages' counts will "wrap around" and make them appear recently used.
+ *
+ * We allow this code to be executed concurrently by multiple processes within
+ * SimpleLruReadPage_ReadOnly().  As long as int reads and writes are atomic,
+ * this should not cause any completely-bogus values to enter the computation.
+ * However, it is possible for either cur_lru_count or individual
+ * page_lru_count entries to be "reset" to lower values than they should have,
+ * in case a process is delayed while it executes this macro.  With care in
+ * SlruSelectLRUPage(), this does little harm, and in any case the absolute
+ * worst possible consequence is a nonoptimal choice of page to evict.  The
+ * gain from allowing concurrent reads of SLRU pages seems worth it.
  */
 #define SlruRecentlyUsed(shared, slotno)	\
 	do { \
-		if ((shared)->page_lru_count[slotno] != 0) { \
-			int		iilru; \
-			for (iilru = 0; iilru < NUM_SLRU_BUFFERS; iilru++) \
-				(shared)->page_lru_count[iilru]++; \
-			(shared)->page_lru_count[slotno] = 0; \
+		int		new_lru_count = (shared)->cur_lru_count; \
+		if (new_lru_count != (shared)->page_lru_count[slotno]) { \
+			(shared)->cur_lru_count = ++new_lru_count; \
+			(shared)->page_lru_count[slotno] = new_lru_count; \
 		} \
 	} while (0)
 
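To see what the rewritten macro does, here is a hedged standalone sketch of the counter scheme. The names cur_lru_count and page_lru_count mirror the shared fields, but the file-scope globals, the starting value of 1, toy_recently_used(), and TOY_NUM_BUFFERS are all illustrative, not part of the patch.

	#include <stdio.h>

	#define TOY_NUM_BUFFERS 4

	/* toy copies of the shared counters (the real ones live in SlruSharedData) */
	static int cur_lru_count = 1;
	static int page_lru_count[TOY_NUM_BUFFERS];		/* all start at 0 */

	static void
	toy_recently_used(int slotno)
	{
		int		new_lru_count = cur_lru_count;

		/* a repeated hit on the same slot does not advance the counter */
		if (new_lru_count != page_lru_count[slotno])
		{
			cur_lru_count = ++new_lru_count;
			page_lru_count[slotno] = new_lru_count;
		}
	}

	int
	main(void)
	{
		toy_recently_used(2);	/* stamps slot 2 with 2, counter -> 2 */
		toy_recently_used(2);	/* same slot again: no increment */
		toy_recently_used(0);	/* stamps slot 0 with 3, counter -> 3 */
		printf("cur_lru_count = %d, slot 2 = %d, slot 0 = %d\n",
			   cur_lru_count, page_lru_count[2], page_lru_count[0]);
		return 0;
	}

Note that larger page_lru_count values now mean more recently used, which is why SimpleLruInit below initializes the counts to 0 instead of 1, and why SlruSelectLRUPage computes a page's age as cur_lru_count minus its per-page count.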
@@ -165,7 +184,7 @@ SimpleLruInit(SlruCtl ctl, const char *name,
 		shared->page_buffer[slotno] = bufptr;
 		shared->page_status[slotno] = SLRU_PAGE_EMPTY;
 		shared->page_dirty[slotno] = false;
-		shared->page_lru_count[slotno] = 1;
+		shared->page_lru_count[slotno] = 0;
 		shared->buffer_locks[slotno] = LWLockAssign();
 		bufptr += BLCKSZ;
 	}
@@ -351,6 +370,49 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid)
 	}
 }
 
+/*
+ * Find a page in a shared buffer, reading it in if necessary.
+ * The page number must correspond to an already-initialized page.
+ * The caller must intend only read-only access to the page.
+ *
+ * The passed-in xid is used only for error reporting, and may be
+ * InvalidTransactionId if no specific xid is associated with the action.
+ *
+ * Return value is the shared-buffer slot number now holding the page.
+ * The buffer's LRU access info is updated.
+ *
+ * Control lock must NOT be held at entry, but will be held at exit.
+ * It is unspecified whether the lock will be shared or exclusive.
+ */
+int
+SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid)
+{
+	SlruShared	shared = ctl->shared;
+	int			slotno;
+
+	/* Try to find the page while holding only shared lock */
+	LWLockAcquire(shared->ControlLock, LW_SHARED);
+
+	/* See if page is already in a buffer */
+	for (slotno = 0; slotno < NUM_SLRU_BUFFERS; slotno++)
+	{
+		if (shared->page_number[slotno] == pageno &&
+			shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
+			shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS)
+		{
+			/* See comments for SlruRecentlyUsed macro */
+			SlruRecentlyUsed(shared, slotno);
+			return slotno;
+		}
+	}
+
+	/* No luck, so switch to normal exclusive lock and do regular read */
+	LWLockRelease(shared->ControlLock);
+	LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
+
+	return SimpleLruReadPage(ctl, pageno, xid);
+}
+
 /*
  * Write a page from a shared buffer, if necessary.
  * Does nothing if the specified slot is not dirty.
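For context, a hypothetical fragment showing how a read-only caller might use the new entry point, modeled on the way clog.c's TransactionIdGetStatus() could call it. ClogCtl, TransactionIdToPage, TransactionIdToByte, page_buffer, and ControlLock are existing names from clog.c and slru.h, but this fragment is illustrative only and is not part of this patch. The essential points are that the caller must not hold the control lock on entry and must release it afterwards, without caring whether it ended up shared or exclusive.

	int		pageno = TransactionIdToPage(xid);
	int		slotno;
	char   *byteptr;

	/* may complete under only a shared control lock if the page is resident */
	slotno = SimpleLruReadPage_ReadOnly(ClogCtl, pageno, xid);
	byteptr = ClogCtl->shared->page_buffer[slotno] + TransactionIdToByte(xid);

	/* ... inspect *byteptr while the control lock is held ... */

	/* caller releases the control lock; it may be shared or exclusive */
	LWLockRelease(ClogCtl->shared->ControlLock);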
@@ -729,8 +791,10 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
 	for (;;)
 	{
 		int			slotno;
-		int			bestslot = 0;
-		unsigned int bestcount = 0;
+		int			cur_count;
+		int			bestslot;
+		int			best_delta;
+		int			best_page_number;
 
 		/* See if page already has a buffer assigned */
 		for (slotno = 0; slotno < NUM_SLRU_BUFFERS; slotno++)
@@ -742,17 +806,59 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
 
 		/*
 		 * If we find any EMPTY slot, just select that one. Else locate the
-		 * least-recently-used slot that isn't the latest page.
+		 * least-recently-used slot to replace.
+		 *
+		 * Normally the page_lru_count values will all be different and so
+		 * there will be a well-defined LRU page.  But since we allow
+		 * concurrent execution of SlruRecentlyUsed() within
+		 * SimpleLruReadPage_ReadOnly(), it is possible that multiple pages
+		 * acquire the same lru_count values.  In that case we break ties by
+		 * choosing the furthest-back page.
+		 *
+		 * In no case will we select the slot containing latest_page_number
+		 * for replacement, even if it appears least recently used.
+		 *
+		 * Notice that this next line forcibly advances cur_lru_count to a
+		 * value that is certainly beyond any value that will be in the
+		 * page_lru_count array after the loop finishes.  This ensures that
+		 * the next execution of SlruRecentlyUsed will mark the page newly
+		 * used, even if it's for a page that has the current counter value.
+		 * That gets us back on the path to having good data when there are
+		 * multiple pages with the same lru_count.
 		 */
+		cur_count = (shared->cur_lru_count)++;
+		best_delta = -1;
+		bestslot = 0;			/* no-op, just keeps compiler quiet */
+		best_page_number = 0;	/* ditto */
 		for (slotno = 0; slotno < NUM_SLRU_BUFFERS; slotno++)
 		{
+			int			this_delta;
+			int			this_page_number;
+
 			if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
 				return slotno;
-			if (shared->page_lru_count[slotno] > bestcount &&
-				shared->page_number[slotno] != shared->latest_page_number)
+			this_delta = cur_count - shared->page_lru_count[slotno];
+			if (this_delta < 0)
+			{
+				/*
+				 * Clean up in case shared updates have caused cur_count
+				 * increments to get "lost".  We back off the page counts,
+				 * rather than trying to increase cur_count, to avoid any
+				 * question of infinite loops or failure in the presence of
+				 * wrapped-around counts.
+				 */
+				shared->page_lru_count[slotno] = cur_count;
+				this_delta = 0;
+			}
+			this_page_number = shared->page_number[slotno];
+			if ((this_delta > best_delta ||
+				 (this_delta == best_delta &&
+				  ctl->PagePrecedes(this_page_number, best_page_number))) &&
+				this_page_number != shared->latest_page_number)
 			{
 				bestslot = slotno;
-				bestcount = shared->page_lru_count[slotno];
+				best_delta = this_delta;
+				best_page_number = this_page_number;
 			}
 		}
 
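Finally, a standalone sketch of the victim-selection arithmetic in the loop above. It is a toy, not the patched function: the arrays are local stand-ins for the shared state, toy_select_victim() and TOY_NUM_BUFFERS are invented names, and a plain "<" comparison replaces the PagePrecedes callback. It shows both the age computation (cur_count minus the per-page count, clamped at zero when the counter has run ahead) and the tie-break toward the furthest-back page.

	#include <stdio.h>

	#define TOY_NUM_BUFFERS 4

	static int
	toy_select_victim(int cur_count, int page_lru_count[], const int page_number[],
					  int latest_page_number)
	{
		int		bestslot = 0;
		int		best_delta = -1;
		int		best_page_number = 0;
		int		slotno;

		for (slotno = 0; slotno < TOY_NUM_BUFFERS; slotno++)
		{
			int		this_delta = cur_count - page_lru_count[slotno];
			int		this_page_number = page_number[slotno];

			if (this_delta < 0)
			{
				/* the counter ran ahead of this stamp; back the stamp off */
				page_lru_count[slotno] = cur_count;
				this_delta = 0;
			}
			if ((this_delta > best_delta ||
				 (this_delta == best_delta &&
				  this_page_number < best_page_number)) &&	/* stand-in for PagePrecedes */
				this_page_number != latest_page_number)
			{
				bestslot = slotno;
				best_delta = this_delta;
				best_page_number = this_page_number;
			}
		}
		return bestslot;
	}

	int
	main(void)
	{
		int		page_lru_count[TOY_NUM_BUFFERS] = {10, 7, 7, 12};
		int		page_number[TOY_NUM_BUFFERS] = {40, 41, 38, 42};

		/* slots 1 and 2 tie on age; the one holding the older page (38) wins */
		printf("victim slot = %d\n",
			   toy_select_victim(12, page_lru_count, page_number, 42));
		return 0;
	}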