5252 * not clear this helps much, but it can't hurt. (XXX perhaps a LIFO
5353 * policy for free blocks would be better?)
5454 *
55+ * To further make the I/Os more sequential, we can use a larger buffer
56+ * when reading, and read multiple blocks from the same tape in one go,
57+ * whenever the buffer becomes empty. LogicalTapeAssignReadBufferSize()
58+ * can be used to set the size of the read buffer.
59+ *
5560 * To support the above policy of writing to the lowest free block,
5661 * ltsGetFreeBlock sorts the list of free block numbers into decreasing
5762 * order each time it is asked for a block and the list isn't currently
5863 * sorted. This is an efficient way to handle it because we expect cycles
5964 * of releasing many blocks followed by re-using many blocks, due to
60- *tuplesort.c's "preread" behavior .
65+ * the larger read buffer.
6166 *
6267 * Since all the bookkeeping and buffer memory is allocated with palloc(),
6368 * and the underlying file(s) are made with OpenTemporaryFile, all resources
7984
8085#include "storage/buffile.h"
8186#include "utils/logtape.h"
87+ #include "utils/memutils.h"
8288
8389/*
8490 * Block indexes are "long"s, so we can fit this many per indirect block.
@@ -131,9 +137,18 @@ typedef struct LogicalTape
131137 * reading.
132138 */
133139char * buffer ;/* physical buffer (separately palloc'd) */
140+ int buffer_size ;/* allocated size of the buffer */
134141long curBlockNumber ;/* this block's logical blk# within tape */
135142int pos ;/* next read/write position in buffer */
136143int nbytes ;/* total # of valid bytes in buffer */
144+
145+ /*
146+ * Desired buffer size to use when reading. To keep things simple, we use
147+ * a single-block buffer when writing, or when reading a frozen tape. But
148+ * when we are reading and will only read forwards, we allocate a larger
149+ * buffer, determined by read_buffer_size.
150+ */
151+ int read_buffer_size ;
137152}LogicalTape ;
138153
139154/*
@@ -227,6 +242,53 @@ ltsReadBlock(LogicalTapeSet *lts, long blocknum, void *buffer)
227242blocknum )));
228243}
229244
245+ /*
246+ * Read as many blocks as we can into the per-tape buffer.
247+ *
248+ * The caller can specify the next physical block number to read, in
249+ * datablocknum, or -1 to fetch the next block number from the internal block.
250+ * If datablocknum == -1, the caller must've already set curBlockNumber.
251+ *
252+ * Returns true if anything was read, 'false' on EOF.
253+ */
254+ static bool
255+ ltsReadFillBuffer (LogicalTapeSet * lts ,LogicalTape * lt ,long datablocknum )
256+ {
257+ lt -> pos = 0 ;
258+ lt -> nbytes = 0 ;
259+
260+ do
261+ {
262+ /* Fetch next block number (unless provided by caller) */
263+ if (datablocknum == -1 )
264+ {
265+ datablocknum = ltsRecallNextBlockNum (lts ,lt -> indirect ,lt -> frozen );
266+ if (datablocknum == -1L )
267+ break ;/* EOF */
268+ lt -> curBlockNumber ++ ;
269+ }
270+
271+ /* Read the block */
272+ ltsReadBlock (lts ,datablocknum , (void * ) (lt -> buffer + lt -> nbytes ));
273+ if (!lt -> frozen )
274+ ltsReleaseBlock (lts ,datablocknum );
275+
276+ if (lt -> curBlockNumber < lt -> numFullBlocks )
277+ lt -> nbytes += BLCKSZ ;
278+ else
279+ {
280+ /* EOF */
281+ lt -> nbytes += lt -> lastBlockBytes ;
282+ break ;
283+ }
284+
285+ /* Advance to next block, if we have buffer space left */
286+ datablocknum = -1 ;
287+ }while (lt -> nbytes < lt -> buffer_size );
288+
289+ return (lt -> nbytes > 0 );
290+ }
291+
230292/*
231293 * qsort comparator for sorting freeBlocks[] into decreasing order.
232294 */
@@ -546,6 +608,8 @@ LogicalTapeSetCreate(int ntapes)
546608lt -> numFullBlocks = 0L ;
547609lt -> lastBlockBytes = 0 ;
548610lt -> buffer = NULL ;
611+ lt -> buffer_size = 0 ;
612+ lt -> read_buffer_size = BLCKSZ ;
549613lt -> curBlockNumber = 0L ;
550614lt -> pos = 0 ;
551615lt -> nbytes = 0 ;
@@ -628,14 +692,18 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
628692
629693/* Allocate data buffer and first indirect block on first write */
630694if (lt -> buffer == NULL )
695+ {
631696lt -> buffer = (char * )palloc (BLCKSZ );
697+ lt -> buffer_size = BLCKSZ ;
698+ }
632699if (lt -> indirect == NULL )
633700{
634701lt -> indirect = (IndirectBlock * )palloc (sizeof (IndirectBlock ));
635702lt -> indirect -> nextSlot = 0 ;
636703lt -> indirect -> nextup = NULL ;
637704}
638705
706+ Assert (lt -> buffer_size == BLCKSZ );
639707while (size > 0 )
640708{
641709if (lt -> pos >=BLCKSZ )
@@ -709,18 +777,19 @@ LogicalTapeRewind(LogicalTapeSet *lts, int tapenum, bool forWrite)
709777Assert (lt -> frozen );
710778datablocknum = ltsRewindFrozenIndirectBlock (lts ,lt -> indirect );
711779}
780+
781+ /* Allocate a read buffer */
782+ if (lt -> buffer )
783+ pfree (lt -> buffer );
784+ lt -> buffer = palloc (lt -> read_buffer_size );
785+ lt -> buffer_size = lt -> read_buffer_size ;
786+
712787/* Read the first block, or reset if tape is empty */
713788lt -> curBlockNumber = 0L ;
714789lt -> pos = 0 ;
715790lt -> nbytes = 0 ;
716791if (datablocknum != -1L )
717- {
718- ltsReadBlock (lts ,datablocknum , (void * )lt -> buffer );
719- if (!lt -> frozen )
720- ltsReleaseBlock (lts ,datablocknum );
721- lt -> nbytes = (lt -> curBlockNumber < lt -> numFullBlocks ) ?
722- BLCKSZ :lt -> lastBlockBytes ;
723- }
792+ ltsReadFillBuffer (lts ,lt ,datablocknum );
724793}
725794else
726795{
@@ -754,6 +823,13 @@ LogicalTapeRewind(LogicalTapeSet *lts, int tapenum, bool forWrite)
754823lt -> curBlockNumber = 0L ;
755824lt -> pos = 0 ;
756825lt -> nbytes = 0 ;
826+
827+ if (lt -> buffer )
828+ {
829+ pfree (lt -> buffer );
830+ lt -> buffer = NULL ;
831+ lt -> buffer_size = 0 ;
832+ }
757833}
758834}
759835
@@ -779,20 +855,8 @@ LogicalTapeRead(LogicalTapeSet *lts, int tapenum,
779855if (lt -> pos >=lt -> nbytes )
780856{
781857/* Try to load more data into buffer. */
782- long datablocknum = ltsRecallNextBlockNum (lts ,lt -> indirect ,
783- lt -> frozen );
784-
785- if (datablocknum == -1L )
858+ if (!ltsReadFillBuffer (lts ,lt ,-1 ))
786859break ;/* EOF */
787- lt -> curBlockNumber ++ ;
788- lt -> pos = 0 ;
789- ltsReadBlock (lts ,datablocknum , (void * )lt -> buffer );
790- if (!lt -> frozen )
791- ltsReleaseBlock (lts ,datablocknum );
792- lt -> nbytes = (lt -> curBlockNumber < lt -> numFullBlocks ) ?
793- BLCKSZ :lt -> lastBlockBytes ;
794- if (lt -> nbytes <=0 )
795- break ;/* EOF (possible here?) */
796860}
797861
798862nthistime = lt -> nbytes - lt -> pos ;
@@ -842,6 +906,22 @@ LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum)
842906lt -> writing = false;
843907lt -> frozen = true;
844908datablocknum = ltsRewindIndirectBlock (lts ,lt -> indirect , true);
909+
910+ /*
911+ * The seek and backspace functions assume a single block read buffer.
912+ * That's OK with current usage. A larger buffer is helpful to make the
913+ * read pattern of the backing file look more sequential to the OS, when
914+ * we're reading from multiple tapes. But at the end of a sort, when a
915+ * tape is frozen, we only read from a single tape anyway.
916+ */
917+ if (!lt -> buffer || lt -> buffer_size != BLCKSZ )
918+ {
919+ if (lt -> buffer )
920+ pfree (lt -> buffer );
921+ lt -> buffer = palloc (BLCKSZ );
922+ lt -> buffer_size = BLCKSZ ;
923+ }
924+
845925/* Read the first block, or reset if tape is empty */
846926lt -> curBlockNumber = 0L ;
847927lt -> pos = 0 ;
@@ -875,6 +955,7 @@ LogicalTapeBackspace(LogicalTapeSet *lts, int tapenum, size_t size)
875955Assert (tapenum >=0 && tapenum < lts -> nTapes );
876956lt = & lts -> tapes [tapenum ];
877957Assert (lt -> frozen );
958+ Assert (lt -> buffer_size == BLCKSZ );
878959
879960/*
880961 * Easy case for seek within current block.
@@ -941,6 +1022,7 @@ LogicalTapeSeek(LogicalTapeSet *lts, int tapenum,
9411022lt = & lts -> tapes [tapenum ];
9421023Assert (lt -> frozen );
9431024Assert (offset >=0 && offset <=BLCKSZ );
1025+ Assert (lt -> buffer_size == BLCKSZ );
9441026
9451027/*
9461028 * Easy case for seek within current block.
@@ -1002,6 +1084,10 @@ LogicalTapeTell(LogicalTapeSet *lts, int tapenum,
10021084
10031085Assert (tapenum >=0 && tapenum < lts -> nTapes );
10041086lt = & lts -> tapes [tapenum ];
1087+
1088+ /* With a larger buffer, 'pos' wouldn't be the same as offset within page */
1089+ Assert (lt -> buffer_size == BLCKSZ );
1090+
10051091* blocknum = lt -> curBlockNumber ;
10061092* offset = lt -> pos ;
10071093}
@@ -1014,3 +1100,28 @@ LogicalTapeSetBlocks(LogicalTapeSet *lts)
10141100{
10151101return lts -> nFileBlocks ;
10161102}
1103+
1104+ /*
1105+ * Set buffer size to use, when reading from given tape.
1106+ */
1107+ void
1108+ LogicalTapeAssignReadBufferSize (LogicalTapeSet * lts ,int tapenum ,size_t avail_mem )
1109+ {
1110+ LogicalTape * lt ;
1111+
1112+ Assert (tapenum >=0 && tapenum < lts -> nTapes );
1113+ lt = & lts -> tapes [tapenum ];
1114+
1115+ /*
1116+ * The buffer size must be a multiple of BLCKSZ in size, so round the
1117+ * given value down to nearest BLCKSZ. Make sure we have at least one
1118+ * page. Also, don't go above MaxAllocSize, to avoid erroring out. A
1119+ * multi-gigabyte buffer is unlikely to be helpful, anyway.
1120+ */
1121+ if (avail_mem < BLCKSZ )
1122+ avail_mem = BLCKSZ ;
1123+ if (avail_mem > MaxAllocSize )
1124+ avail_mem = MaxAllocSize ;
1125+ avail_mem -= avail_mem %BLCKSZ ;
1126+ lt -> read_buffer_size = avail_mem ;
1127+ }