Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commite94568e

Browse files
committed
Change the way pre-reading in external sort's merge phase works.
Don't pre-read tuples into SortTuple slots during merge. Instead, use thememory for larger read buffers in logtape.c. We're doing the same numberof READTUP() calls either way, but managing the pre-read SortTuple slotsis much more complicated. Also, the on-tape representation is more compactthan SortTuples, so we can fit more pre-read tuples into the same amountof memory this way. And we have better cache-locality, when we use just asmall number of SortTuple slots.Now that we only hold one tuple from each tape in the SortTuple slots, wecan greatly simplify the "batch memory" management. We now maintain asmall set of fixed-sized slots, to hold the tuples, and fall back topalloc() for larger tuples. We use this method during all merge phases,not just the final merge, and also when randomAccess is requested, andalso in the TSS_SORTEDONTAPE case. In other words, it's used whenever wedo an external sort.Reviewed by Peter Geoghegan and Claudio Freire.Discussion: <CAM3SWZTpaORV=yQGVCG8Q4axcZ3MvF-05xe39ZvORdU9JcD6hQ@mail.gmail.com>
1 parente8bdee2 commite94568e

File tree

3 files changed

+574
-797
lines changed

3 files changed

+574
-797
lines changed

‎src/backend/utils/sort/logtape.c

Lines changed: 132 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -52,12 +52,17 @@
5252
* not clear this helps much, but it can't hurt. (XXX perhaps a LIFO
5353
* policy for free blocks would be better?)
5454
*
55+
* To further make the I/Os more sequential, we can use a larger buffer
56+
* when reading, and read multiple blocks from the same tape in one go,
57+
* whenever the buffer becomes empty. LogicalTapeAssignReadBufferSize()
58+
* can be used to set the size of the read buffer.
59+
*
5560
* To support the above policy of writing to the lowest free block,
5661
* ltsGetFreeBlock sorts the list of free block numbers into decreasing
5762
* order each time it is asked for a block and the list isn't currently
5863
* sorted. This is an efficient way to handle it because we expect cycles
5964
* of releasing many blocks followed by re-using many blocks, due to
60-
*tuplesort.c's "preread" behavior.
65+
*the larger read buffer.
6166
*
6267
* Since all the bookkeeping and buffer memory is allocated with palloc(),
6368
* and the underlying file(s) are made with OpenTemporaryFile, all resources
@@ -79,6 +84,7 @@
7984

8085
#include"storage/buffile.h"
8186
#include"utils/logtape.h"
87+
#include"utils/memutils.h"
8288

8389
/*
8490
* Block indexes are "long"s, so we can fit this many per indirect block.
@@ -131,9 +137,18 @@ typedef struct LogicalTape
131137
* reading.
132138
*/
133139
char*buffer;/* physical buffer (separately palloc'd) */
140+
intbuffer_size;/* allocated size of the buffer */
134141
longcurBlockNumber;/* this block's logical blk# within tape */
135142
intpos;/* next read/write position in buffer */
136143
intnbytes;/* total # of valid bytes in buffer */
144+
145+
/*
146+
* Desired buffer size to use when reading. To keep things simple, we use
147+
* a single-block buffer when writing, or when reading a frozen tape. But
148+
* when we are reading and will only read forwards, we allocate a larger
149+
* buffer, determined by read_buffer_size.
150+
*/
151+
intread_buffer_size;
137152
}LogicalTape;
138153

139154
/*
@@ -227,6 +242,53 @@ ltsReadBlock(LogicalTapeSet *lts, long blocknum, void *buffer)
227242
blocknum)));
228243
}
229244

245+
/*
246+
* Read as many blocks as we can into the per-tape buffer.
247+
*
248+
* The caller can specify the next physical block number to read, in
249+
* datablocknum, or -1 to fetch the next block number from the internal block.
250+
* If datablocknum == -1, the caller must've already set curBlockNumber.
251+
*
252+
* Returns true if anything was read, 'false' on EOF.
253+
*/
254+
staticbool
255+
ltsReadFillBuffer(LogicalTapeSet*lts,LogicalTape*lt,longdatablocknum)
256+
{
257+
lt->pos=0;
258+
lt->nbytes=0;
259+
260+
do
261+
{
262+
/* Fetch next block number (unless provided by caller) */
263+
if (datablocknum==-1)
264+
{
265+
datablocknum=ltsRecallNextBlockNum(lts,lt->indirect,lt->frozen);
266+
if (datablocknum==-1L)
267+
break;/* EOF */
268+
lt->curBlockNumber++;
269+
}
270+
271+
/* Read the block */
272+
ltsReadBlock(lts,datablocknum, (void*) (lt->buffer+lt->nbytes));
273+
if (!lt->frozen)
274+
ltsReleaseBlock(lts,datablocknum);
275+
276+
if (lt->curBlockNumber<lt->numFullBlocks)
277+
lt->nbytes+=BLCKSZ;
278+
else
279+
{
280+
/* EOF */
281+
lt->nbytes+=lt->lastBlockBytes;
282+
break;
283+
}
284+
285+
/* Advance to next block, if we have buffer space left */
286+
datablocknum=-1;
287+
}while (lt->nbytes<lt->buffer_size);
288+
289+
return (lt->nbytes>0);
290+
}
291+
230292
/*
231293
* qsort comparator for sorting freeBlocks[] into decreasing order.
232294
*/
@@ -546,6 +608,8 @@ LogicalTapeSetCreate(int ntapes)
546608
lt->numFullBlocks=0L;
547609
lt->lastBlockBytes=0;
548610
lt->buffer=NULL;
611+
lt->buffer_size=0;
612+
lt->read_buffer_size=BLCKSZ;
549613
lt->curBlockNumber=0L;
550614
lt->pos=0;
551615
lt->nbytes=0;
@@ -628,14 +692,18 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
628692

629693
/* Allocate data buffer and first indirect block on first write */
630694
if (lt->buffer==NULL)
695+
{
631696
lt->buffer= (char*)palloc(BLCKSZ);
697+
lt->buffer_size=BLCKSZ;
698+
}
632699
if (lt->indirect==NULL)
633700
{
634701
lt->indirect= (IndirectBlock*)palloc(sizeof(IndirectBlock));
635702
lt->indirect->nextSlot=0;
636703
lt->indirect->nextup=NULL;
637704
}
638705

706+
Assert(lt->buffer_size==BLCKSZ);
639707
while (size>0)
640708
{
641709
if (lt->pos >=BLCKSZ)
@@ -709,18 +777,19 @@ LogicalTapeRewind(LogicalTapeSet *lts, int tapenum, bool forWrite)
709777
Assert(lt->frozen);
710778
datablocknum=ltsRewindFrozenIndirectBlock(lts,lt->indirect);
711779
}
780+
781+
/* Allocate a read buffer */
782+
if (lt->buffer)
783+
pfree(lt->buffer);
784+
lt->buffer=palloc(lt->read_buffer_size);
785+
lt->buffer_size=lt->read_buffer_size;
786+
712787
/* Read the first block, or reset if tape is empty */
713788
lt->curBlockNumber=0L;
714789
lt->pos=0;
715790
lt->nbytes=0;
716791
if (datablocknum!=-1L)
717-
{
718-
ltsReadBlock(lts,datablocknum, (void*)lt->buffer);
719-
if (!lt->frozen)
720-
ltsReleaseBlock(lts,datablocknum);
721-
lt->nbytes= (lt->curBlockNumber<lt->numFullBlocks) ?
722-
BLCKSZ :lt->lastBlockBytes;
723-
}
792+
ltsReadFillBuffer(lts,lt,datablocknum);
724793
}
725794
else
726795
{
@@ -754,6 +823,13 @@ LogicalTapeRewind(LogicalTapeSet *lts, int tapenum, bool forWrite)
754823
lt->curBlockNumber=0L;
755824
lt->pos=0;
756825
lt->nbytes=0;
826+
827+
if (lt->buffer)
828+
{
829+
pfree(lt->buffer);
830+
lt->buffer=NULL;
831+
lt->buffer_size=0;
832+
}
757833
}
758834
}
759835

@@ -779,20 +855,8 @@ LogicalTapeRead(LogicalTapeSet *lts, int tapenum,
779855
if (lt->pos >=lt->nbytes)
780856
{
781857
/* Try to load more data into buffer. */
782-
longdatablocknum=ltsRecallNextBlockNum(lts,lt->indirect,
783-
lt->frozen);
784-
785-
if (datablocknum==-1L)
858+
if (!ltsReadFillBuffer(lts,lt,-1))
786859
break;/* EOF */
787-
lt->curBlockNumber++;
788-
lt->pos=0;
789-
ltsReadBlock(lts,datablocknum, (void*)lt->buffer);
790-
if (!lt->frozen)
791-
ltsReleaseBlock(lts,datablocknum);
792-
lt->nbytes= (lt->curBlockNumber<lt->numFullBlocks) ?
793-
BLCKSZ :lt->lastBlockBytes;
794-
if (lt->nbytes <=0)
795-
break;/* EOF (possible here?) */
796860
}
797861

798862
nthistime=lt->nbytes-lt->pos;
@@ -842,6 +906,22 @@ LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum)
842906
lt->writing= false;
843907
lt->frozen= true;
844908
datablocknum=ltsRewindIndirectBlock(lts,lt->indirect, true);
909+
910+
/*
911+
* The seek and backspace functions assume a single block read buffer.
912+
* That's OK with current usage. A larger buffer is helpful to make the
913+
* read pattern of the backing file look more sequential to the OS, when
914+
* we're reading from multiple tapes. But at the end of a sort, when a
915+
* tape is frozen, we only read from a single tape anyway.
916+
*/
917+
if (!lt->buffer||lt->buffer_size!=BLCKSZ)
918+
{
919+
if (lt->buffer)
920+
pfree(lt->buffer);
921+
lt->buffer=palloc(BLCKSZ);
922+
lt->buffer_size=BLCKSZ;
923+
}
924+
845925
/* Read the first block, or reset if tape is empty */
846926
lt->curBlockNumber=0L;
847927
lt->pos=0;
@@ -875,6 +955,7 @@ LogicalTapeBackspace(LogicalTapeSet *lts, int tapenum, size_t size)
875955
Assert(tapenum >=0&&tapenum<lts->nTapes);
876956
lt=&lts->tapes[tapenum];
877957
Assert(lt->frozen);
958+
Assert(lt->buffer_size==BLCKSZ);
878959

879960
/*
880961
* Easy case for seek within current block.
@@ -941,6 +1022,7 @@ LogicalTapeSeek(LogicalTapeSet *lts, int tapenum,
9411022
lt=&lts->tapes[tapenum];
9421023
Assert(lt->frozen);
9431024
Assert(offset >=0&&offset <=BLCKSZ);
1025+
Assert(lt->buffer_size==BLCKSZ);
9441026

9451027
/*
9461028
* Easy case for seek within current block.
@@ -1002,6 +1084,10 @@ LogicalTapeTell(LogicalTapeSet *lts, int tapenum,
10021084

10031085
Assert(tapenum >=0&&tapenum<lts->nTapes);
10041086
lt=&lts->tapes[tapenum];
1087+
1088+
/* With a larger buffer, 'pos' wouldn't be the same as offset within page */
1089+
Assert(lt->buffer_size==BLCKSZ);
1090+
10051091
*blocknum=lt->curBlockNumber;
10061092
*offset=lt->pos;
10071093
}
@@ -1014,3 +1100,28 @@ LogicalTapeSetBlocks(LogicalTapeSet *lts)
10141100
{
10151101
returnlts->nFileBlocks;
10161102
}
1103+
1104+
/*
1105+
* Set buffer size to use, when reading from given tape.
1106+
*/
1107+
void
1108+
LogicalTapeAssignReadBufferSize(LogicalTapeSet*lts,inttapenum,size_tavail_mem)
1109+
{
1110+
LogicalTape*lt;
1111+
1112+
Assert(tapenum >=0&&tapenum<lts->nTapes);
1113+
lt=&lts->tapes[tapenum];
1114+
1115+
/*
1116+
* The buffer size must be a multiple of BLCKSZ in size, so round the
1117+
* given value down to nearest BLCKSZ. Make sure we have at least one
1118+
* page. Also, don't go above MaxAllocSize, to avoid erroring out. A
1119+
* multi-gigabyte buffer is unlikely to be helpful, anyway.
1120+
*/
1121+
if (avail_mem<BLCKSZ)
1122+
avail_mem=BLCKSZ;
1123+
if (avail_mem>MaxAllocSize)
1124+
avail_mem=MaxAllocSize;
1125+
avail_mem-=avail_mem %BLCKSZ;
1126+
lt->read_buffer_size=avail_mem;
1127+
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp